wang commited on
Commit
464108b
·
verified ·
1 Parent(s): 8f317cc

Upload 404 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. chinese_medical_ner/ccksyidu4k-ner-roformer/._README.md +0 -0
  3. chinese_medical_ner/ccksyidu4k-ner-roformer/.gitignore +158 -0
  4. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/NamedEntityRecognization.iml +12 -0
  5. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/csv-plugin.xml +16 -0
  6. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/deployment.xml +14 -0
  7. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/inspectionProfiles/Project_Default.xml +178 -0
  8. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/inspectionProfiles/profiles_settings.xml +6 -0
  9. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/misc.xml +4 -0
  10. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/modules.xml +8 -0
  11. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/remote-mappings.xml +16 -0
  12. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/vcs.xml +6 -0
  13. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/webServers.xml +14 -0
  14. chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/workspace.xml +256 -0
  15. chinese_medical_ner/ccksyidu4k-ner-roformer/README.md +340 -0
  16. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/config.cpython-310.pyc +0 -0
  17. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/config.cpython-37.pyc +0 -0
  18. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/config.cpython-38.pyc +0 -0
  19. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/evaluate.cpython-310.pyc +0 -0
  20. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/evaluate.cpython-37.pyc +0 -0
  21. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/evaluate.cpython-38.pyc +0 -0
  22. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/model.cpython-310.pyc +0 -0
  23. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/model.cpython-37.pyc +0 -0
  24. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/model.cpython-38.pyc +0 -0
  25. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/path.cpython-310.pyc +0 -0
  26. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/path.cpython-37.pyc +0 -0
  27. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/path.cpython-38.pyc +0 -0
  28. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/plot.cpython-310.pyc +0 -0
  29. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/plot.cpython-37.pyc +0 -0
  30. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/plot.cpython-38.pyc +0 -0
  31. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/predict.cpython-37.pyc +0 -0
  32. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/preprocess.cpython-310.pyc +0 -0
  33. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/preprocess.cpython-37.pyc +0 -0
  34. chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/preprocess.cpython-38.pyc +0 -0
  35. chinese_medical_ner/ccksyidu4k-ner-roformer/calc_bert_matrix.ipynb +534 -0
  36. chinese_medical_ner/ccksyidu4k-ner-roformer/config.py +9 -0
  37. chinese_medical_ner/ccksyidu4k-ner-roformer/cudnn-7.6.5-cuda10.0_0.conda +3 -0
  38. chinese_medical_ner/ccksyidu4k-ner-roformer/data/chip.train +0 -0
  39. chinese_medical_ner/ccksyidu4k-ner-roformer/data/chip.validate +0 -0
  40. chinese_medical_ner/ccksyidu4k-ner-roformer/data/yidu.test +0 -0
  41. chinese_medical_ner/ccksyidu4k-ner-roformer/data/yidu.train +0 -0
  42. chinese_medical_ner/ccksyidu4k-ner-roformer/data/yidu.validate +0 -0
  43. chinese_medical_ner/ccksyidu4k-ner-roformer/evaluate.py +154 -0
  44. chinese_medical_ner/ccksyidu4k-ner-roformer/evaluate_ner.py +359 -0
  45. chinese_medical_ner/ccksyidu4k-ner-roformer/images/chip_train_acc.png +0 -0
  46. chinese_medical_ner/ccksyidu4k-ner-roformer/images/chip_train_loss.png +0 -0
  47. chinese_medical_ner/ccksyidu4k-ner-roformer/images/chip_val_f1.png +0 -0
  48. chinese_medical_ner/ccksyidu4k-ner-roformer/images/downstream.png +3 -0
  49. chinese_medical_ner/ccksyidu4k-ner-roformer/images/model.jpg +3 -0
  50. chinese_medical_ner/ccksyidu4k-ner-roformer/images/yidu_train_acc.png +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chinese_medical_ner/ccksyidu4k-ner-roformer/cudnn-7.6.5-cuda10.0_0.conda filter=lfs diff=lfs merge=lfs -text
37
+ chinese_medical_ner/ccksyidu4k-ner-roformer/images/downstream.png filter=lfs diff=lfs merge=lfs -text
38
+ chinese_medical_ner/ccksyidu4k-ner-roformer/images/model.jpg filter=lfs diff=lfs merge=lfs -text
chinese_medical_ner/ccksyidu4k-ner-roformer/._README.md ADDED
Binary file (4.1 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/.gitignore ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### JupyterNotebooks template
2
+ # gitignore template for Jupyter Notebooks
3
+ # website: http://jupyter.org/
4
+
5
+ .ipynb_checkpoints
6
+ */.ipynb_checkpoints/*
7
+
8
+ # IPython
9
+ profile_default/
10
+ ipython_config.py
11
+
12
+ # Remove previous ipynb_checkpoints
13
+ # git rm -r .ipynb_checkpoints/
14
+
15
+ ### Python template
16
+ # Byte-compiled / optimized / DLL files
17
+ __pycache__/
18
+ *.py[cod]
19
+ *$py.class
20
+
21
+ # C extensions
22
+ *.so
23
+
24
+ # Distribution / packaging
25
+ .Python
26
+ build/
27
+ develop-eggs/
28
+ dist/
29
+ downloads/
30
+ eggs/
31
+ .eggs/
32
+ lib/
33
+ lib64/
34
+ parts/
35
+ sdist/
36
+ var/
37
+ wheels/
38
+ share/python-wheels/
39
+ *.egg-info/
40
+ .installed.cfg
41
+ *.egg
42
+ MANIFEST
43
+
44
+ # PyInstaller
45
+ # Usually these files are written by a python script from a template
46
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
47
+ *.manifest
48
+ *.spec
49
+
50
+ # Installer logs
51
+ pip-log.txt
52
+ pip-delete-this-directory.txt
53
+
54
+ # Unit test / coverage reports
55
+ htmlcov/
56
+ .tox/
57
+ .nox/
58
+ .coverage
59
+ .coverage.*
60
+ .cache
61
+ nosetests.xml
62
+ coverage.xml
63
+ *.cover
64
+ *.py,cover
65
+ .hypothesis/
66
+ .pytest_cache/
67
+ cover/
68
+
69
+ # Translations
70
+ *.mo
71
+ *.pot
72
+
73
+ # Django stuff:
74
+ *.log
75
+ local_settings.py
76
+ db.sqlite3
77
+ db.sqlite3-journal
78
+
79
+ # Flask stuff:
80
+ instance/
81
+ .webassets-cache
82
+
83
+ # Scrapy stuff:
84
+ .scrapy
85
+
86
+ # Sphinx documentation
87
+ docs/_build/
88
+
89
+ # PyBuilder
90
+ .pybuilder/
91
+ target/
92
+
93
+ # Jupyter Notebook
94
+ .ipynb_checkpoints
95
+
96
+ # IPython
97
+ profile_default/
98
+ ipython_config.py
99
+
100
+ # pyenv
101
+ # For a library or package, you might want to ignore these files since the code is
102
+ # intended to run in multiple environments; otherwise, check them in:
103
+ # .python-version
104
+
105
+ # pipenv
106
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
107
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
108
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
109
+ # install all needed dependencies.
110
+ #Pipfile.lock
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ chinese_roformer-v2-char_L-6_H-384_A-6/*.ckpt*
156
+ chinese_roformer-v2-char_L-12_H-768_A-12/*.ckpt*
157
+ weights/*.h5
158
+ data/lung.*
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/NamedEntityRecognization.iml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="jdk" jdkName="Python 3.7 (tf_v1)" jdkType="Python SDK" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="PyDocumentationSettings">
9
+ <option name="format" value="PLAIN" />
10
+ <option name="myDocStringFormat" value="Plain" />
11
+ </component>
12
+ </module>
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/csv-plugin.xml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="CsvFileAttributes">
4
+ <option name="attributeMap">
5
+ <map>
6
+ <entry key="/report/yidu_bert_base.csv">
7
+ <value>
8
+ <Attribute>
9
+ <option name="separator" value="," />
10
+ </Attribute>
11
+ </value>
12
+ </entry>
13
+ </map>
14
+ </option>
15
+ </component>
16
+ </project>
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/deployment.xml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="PublishConfigData" serverName="NamedEntityRecognization" createEmptyFolders="true" remoteFilesAllowedToDisappearOnAutoupload="false">
4
+ <serverData>
5
+ <paths name="NamedEntityRecognization">
6
+ <serverdata>
7
+ <mappings>
8
+ <mapping deploy="/home/bureaux/Projects/NamedEntityRecognization" local="$PROJECT_DIR$" web="/" />
9
+ </mappings>
10
+ </serverdata>
11
+ </paths>
12
+ </serverData>
13
+ </component>
14
+ </project>
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="DuplicatedCode" enabled="true" level="WEAK WARNING" enabled_by_default="true">
5
+ <Languages>
6
+ <language minSize="147" name="Python" />
7
+ </Languages>
8
+ </inspection_tool>
9
+ <inspection_tool class="JupyterPackageInspection" enabled="false" level="WARNING" enabled_by_default="false" />
10
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="false" level="WARNING" enabled_by_default="false">
11
+ <option name="ignoredPackages">
12
+ <value>
13
+ <list size="127">
14
+ <item index="0" class="java.lang.String" itemvalue="h5py" />
15
+ <item index="1" class="java.lang.String" itemvalue="six" />
16
+ <item index="2" class="java.lang.String" itemvalue="keras-bert" />
17
+ <item index="3" class="java.lang.String" itemvalue="keras-transformer" />
18
+ <item index="4" class="java.lang.String" itemvalue="absl-py" />
19
+ <item index="5" class="java.lang.String" itemvalue="google-pasta" />
20
+ <item index="6" class="java.lang.String" itemvalue="protobuf" />
21
+ <item index="7" class="java.lang.String" itemvalue="decorator" />
22
+ <item index="8" class="java.lang.String" itemvalue="tensorflow-estimator" />
23
+ <item index="9" class="java.lang.String" itemvalue="joblib" />
24
+ <item index="10" class="java.lang.String" itemvalue="threadpoolctl" />
25
+ <item index="11" class="java.lang.String" itemvalue="opt-einsum" />
26
+ <item index="12" class="java.lang.String" itemvalue="scikit-learn" />
27
+ <item index="13" class="java.lang.String" itemvalue="PyYAML" />
28
+ <item index="14" class="java.lang.String" itemvalue="cycler" />
29
+ <item index="15" class="java.lang.String" itemvalue="gast" />
30
+ <item index="16" class="java.lang.String" itemvalue="numpy" />
31
+ <item index="17" class="java.lang.String" itemvalue="importlib-metadata" />
32
+ <item index="18" class="java.lang.String" itemvalue="Keras-Preprocessing" />
33
+ <item index="19" class="java.lang.String" itemvalue="tensorflow" />
34
+ <item index="20" class="java.lang.String" itemvalue="Pygments" />
35
+ <item index="21" class="java.lang.String" itemvalue="pyzmq" />
36
+ <item index="22" class="java.lang.String" itemvalue="certifi" />
37
+ <item index="23" class="java.lang.String" itemvalue="prompt-toolkit" />
38
+ <item index="24" class="java.lang.String" itemvalue="cached-property" />
39
+ <item index="25" class="java.lang.String" itemvalue="Markdown" />
40
+ <item index="26" class="java.lang.String" itemvalue="scipy" />
41
+ <item index="27" class="java.lang.String" itemvalue="Werkzeug" />
42
+ <item index="28" class="java.lang.String" itemvalue="opencv-python" />
43
+ <item index="29" class="java.lang.String" itemvalue="parso" />
44
+ <item index="30" class="java.lang.String" itemvalue="wrapt" />
45
+ <item index="31" class="java.lang.String" itemvalue="astor" />
46
+ <item index="32" class="java.lang.String" itemvalue="ipython" />
47
+ <item index="33" class="java.lang.String" itemvalue="kiwisolver" />
48
+ <item index="34" class="java.lang.String" itemvalue="typing-extensions" />
49
+ <item index="35" class="java.lang.String" itemvalue="jupyter-client" />
50
+ <item index="36" class="java.lang.String" itemvalue="ipykernel" />
51
+ <item index="37" class="java.lang.String" itemvalue="Keras-Applications" />
52
+ <item index="38" class="java.lang.String" itemvalue="appnope" />
53
+ <item index="39" class="java.lang.String" itemvalue="pandas" />
54
+ <item index="40" class="java.lang.String" itemvalue="termcolor" />
55
+ <item index="41" class="java.lang.String" itemvalue="tensorboard" />
56
+ <item index="42" class="java.lang.String" itemvalue="matplotlib" />
57
+ <item index="43" class="java.lang.String" itemvalue="grpcio" />
58
+ <item index="44" class="java.lang.String" itemvalue="Keras" />
59
+ <item index="45" class="java.lang.String" itemvalue="pytz" />
60
+ <item index="46" class="java.lang.String" itemvalue="Pillow" />
61
+ <item index="47" class="java.lang.String" itemvalue="seqeval" />
62
+ <item index="48" class="java.lang.String" itemvalue="keras-embed-sim" />
63
+ <item index="49" class="java.lang.String" itemvalue="sklearn" />
64
+ <item index="50" class="java.lang.String" itemvalue="keras-position-wise-feed-forward" />
65
+ <item index="51" class="java.lang.String" itemvalue="keras-pos-embd" />
66
+ <item index="52" class="java.lang.String" itemvalue="keras-self-attention" />
67
+ <item index="53" class="java.lang.String" itemvalue="keras-layer-normalization" />
68
+ <item index="54" class="java.lang.String" itemvalue="keras-multi-head" />
69
+ <item index="55" class="java.lang.String" itemvalue="jedi" />
70
+ <item index="56" class="java.lang.String" itemvalue="pyDeprecate" />
71
+ <item index="57" class="java.lang.String" itemvalue="pytorch-lightning" />
72
+ <item index="58" class="java.lang.String" itemvalue="aiohttp" />
73
+ <item index="59" class="java.lang.String" itemvalue="packaging" />
74
+ <item index="60" class="java.lang.String" itemvalue="torch" />
75
+ <item index="61" class="java.lang.String" itemvalue="pyparsing" />
76
+ <item index="62" class="java.lang.String" itemvalue="torchvision" />
77
+ <item index="63" class="java.lang.String" itemvalue="traitlets" />
78
+ <item index="64" class="java.lang.String" itemvalue="testpath" />
79
+ <item index="65" class="java.lang.String" itemvalue="pickleshare" />
80
+ <item index="66" class="java.lang.String" itemvalue="python-dateutil" />
81
+ <item index="67" class="java.lang.String" itemvalue="defusedxml" />
82
+ <item index="68" class="java.lang.String" itemvalue="nbclient" />
83
+ <item index="69" class="java.lang.String" itemvalue="QtPy" />
84
+ <item index="70" class="java.lang.String" itemvalue="MarkupSafe" />
85
+ <item index="71" class="java.lang.String" itemvalue="pycparser" />
86
+ <item index="72" class="java.lang.String" itemvalue="pyasn1-modules" />
87
+ <item index="73" class="java.lang.String" itemvalue="ipython-genutils" />
88
+ <item index="74" class="java.lang.String" itemvalue="jupyterlab-widgets" />
89
+ <item index="75" class="java.lang.String" itemvalue="bleach" />
90
+ <item index="76" class="java.lang.String" itemvalue="oauthlib" />
91
+ <item index="77" class="java.lang.String" itemvalue="astunparse" />
92
+ <item index="78" class="java.lang.String" itemvalue="entrypoints" />
93
+ <item index="79" class="java.lang.String" itemvalue="jsonschema" />
94
+ <item index="80" class="java.lang.String" itemvalue="notebook" />
95
+ <item index="81" class="java.lang.String" itemvalue="qtconsole" />
96
+ <item index="82" class="java.lang.String" itemvalue="terminado" />
97
+ <item index="83" class="java.lang.String" itemvalue="argcomplete" />
98
+ <item index="84" class="java.lang.String" itemvalue="tensorboard-data-server" />
99
+ <item index="85" class="java.lang.String" itemvalue="pexpect" />
100
+ <item index="86" class="java.lang.String" itemvalue="jupyterlab-pygments" />
101
+ <item index="87" class="java.lang.String" itemvalue="nbconvert" />
102
+ <item index="88" class="java.lang.String" itemvalue="attrs" />
103
+ <item index="89" class="java.lang.String" itemvalue="cn2an" />
104
+ <item index="90" class="java.lang.String" itemvalue="flatbuffers" />
105
+ <item index="91" class="java.lang.String" itemvalue="backcall" />
106
+ <item index="92" class="java.lang.String" itemvalue="widgetsnbextension" />
107
+ <item index="93" class="java.lang.String" itemvalue="charset-normalizer" />
108
+ <item index="94" class="java.lang.String" itemvalue="idna" />
109
+ <item index="95" class="java.lang.String" itemvalue="rsa" />
110
+ <item index="96" class="java.lang.String" itemvalue="jupyter-core" />
111
+ <item index="97" class="java.lang.String" itemvalue="tensorflow-addons" />
112
+ <item index="98" class="java.lang.String" itemvalue="matplotlib-inline" />
113
+ <item index="99" class="java.lang.String" itemvalue="ptyprocess" />
114
+ <item index="100" class="java.lang.String" itemvalue="cffi" />
115
+ <item index="101" class="java.lang.String" itemvalue="pandocfilters" />
116
+ <item index="102" class="java.lang.String" itemvalue="wcwidth" />
117
+ <item index="103" class="java.lang.String" itemvalue="pyasn1" />
118
+ <item index="104" class="java.lang.String" itemvalue="requests" />
119
+ <item index="105" class="java.lang.String" itemvalue="Jinja2" />
120
+ <item index="106" class="java.lang.String" itemvalue="typeguard" />
121
+ <item index="107" class="java.lang.String" itemvalue="pyrsistent" />
122
+ <item index="108" class="java.lang.String" itemvalue="requests-oauthlib" />
123
+ <item index="109" class="java.lang.String" itemvalue="jupyter" />
124
+ <item index="110" class="java.lang.String" itemvalue="tensorboard-plugin-wit" />
125
+ <item index="111" class="java.lang.String" itemvalue="zipp" />
126
+ <item index="112" class="java.lang.String" itemvalue="nest-asyncio" />
127
+ <item index="113" class="java.lang.String" itemvalue="urllib3" />
128
+ <item index="114" class="java.lang.String" itemvalue="ipywidgets" />
129
+ <item index="115" class="java.lang.String" itemvalue="tornado" />
130
+ <item index="116" class="java.lang.String" itemvalue="google-auth-oauthlib" />
131
+ <item index="117" class="java.lang.String" itemvalue="nbformat" />
132
+ <item index="118" class="java.lang.String" itemvalue="Send2Trash" />
133
+ <item index="119" class="java.lang.String" itemvalue="prometheus-client" />
134
+ <item index="120" class="java.lang.String" itemvalue="mistune" />
135
+ <item index="121" class="java.lang.String" itemvalue="jupyter-console" />
136
+ <item index="122" class="java.lang.String" itemvalue="cachetools" />
137
+ <item index="123" class="java.lang.String" itemvalue="debugpy" />
138
+ <item index="124" class="java.lang.String" itemvalue="argon2-cffi" />
139
+ <item index="125" class="java.lang.String" itemvalue="webencodings" />
140
+ <item index="126" class="java.lang.String" itemvalue="google-auth" />
141
+ </list>
142
+ </value>
143
+ </option>
144
+ </inspection_tool>
145
+ <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
146
+ <option name="ignoredErrors">
147
+ <list>
148
+ <option value="E501" />
149
+ <option value="E122" />
150
+ <option value="W292" />
151
+ </list>
152
+ </option>
153
+ </inspection_tool>
154
+ <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
155
+ <option name="ignoredErrors">
156
+ <list>
157
+ <option value="N803" />
158
+ <option value="N802" />
159
+ <option value="N806" />
160
+ </list>
161
+ </option>
162
+ </inspection_tool>
163
+ <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
164
+ <option name="ignoredIdentifiers">
165
+ <list>
166
+ <option value="utils.backend.keras" />
167
+ <option value="utils.backend.K" />
168
+ <option value="utils.backend.sparse_multilabel_categorical_crossentropy" />
169
+ </list>
170
+ </option>
171
+ </inspection_tool>
172
+ <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
173
+ <option name="processCode" value="true" />
174
+ <option name="processLiterals" value="true" />
175
+ <option name="processComments" value="true" />
176
+ </inspection_tool>
177
+ </profile>
178
+ </component>
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (tf_v1)" project-jdk-type="Python SDK" />
4
+ </project>
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/NamedEntityRecognization.iml" filepath="$PROJECT_DIR$/.idea/NamedEntityRecognization.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/remote-mappings.xml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="RemoteMappingsManager">
4
+ <list>
5
+ <list>
6
+ <remote-mappings server-id="python@sftp://bureaux@180.169.131.147:22/home/bureaux/miniconda3/envs/Keras-base/bin/python">
7
+ <settings>
8
+ <list>
9
+ <mapping local-root="$PROJECT_DIR$" remote-root="/home/bureaux/Projects/NamedEntityRecognization" />
10
+ </list>
11
+ </settings>
12
+ </remote-mappings>
13
+ </list>
14
+ </list>
15
+ </component>
16
+ </project>
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/webServers.xml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="WebServers">
4
+ <option name="servers">
5
+ <webServer id="fb160272-0942-419e-87dd-a353536a93b5" name="NamedEntityRecognization">
6
+ <fileTransfer accessType="SFTP" host="180.169.131.147" port="22" sshConfigId="03272ad8-3c65-4cd1-95f0-0886d605abb3" sshConfig="bureaux@180.169.131.147:22 password">
7
+ <advancedOptions>
8
+ <advancedOptions dataProtectionLevel="Private" passiveMode="true" shareSSLContext="true" />
9
+ </advancedOptions>
10
+ </fileTransfer>
11
+ </webServer>
12
+ </option>
13
+ </component>
14
+ </project>
chinese_medical_ner/ccksyidu4k-ner-roformer/.idea/workspace.xml ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="AutoImportSettings">
4
+ <option name="autoReloadType" value="SELECTIVE" />
5
+ </component>
6
+ <component name="ChangeListManager">
7
+ <list default="true" id="626a281e-0f78-4eb9-9469-6e0d7f35140d" name="变更" comment="">
8
+ <change afterPath="$PROJECT_DIR$/report/crf_trans_yidu_visual.xlsx" afterDir="false" />
9
+ <change beforePath="$PROJECT_DIR$/.idea/NamedEntityRecognization.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/NamedEntityRecognization.iml" afterDir="false" />
10
+ <change beforePath="$PROJECT_DIR$/.idea/deployment.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/deployment.xml" afterDir="false" />
11
+ <change beforePath="$PROJECT_DIR$/.idea/misc.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/misc.xml" afterDir="false" />
12
+ <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
13
+ <change beforePath="$PROJECT_DIR$/config.py" beforeDir="false" afterPath="$PROJECT_DIR$/config.py" afterDir="false" />
14
+ <change beforePath="$PROJECT_DIR$/path.py" beforeDir="false" afterPath="$PROJECT_DIR$/path.py" afterDir="false" />
15
+ <change beforePath="$PROJECT_DIR$/preprocess.py" beforeDir="false" afterPath="$PROJECT_DIR$/preprocess.py" afterDir="false" />
16
+ <change beforePath="$PROJECT_DIR$/train.py" beforeDir="false" afterPath="$PROJECT_DIR$/train.py" afterDir="false" />
17
+ <change beforePath="$PROJECT_DIR$/utils/snippets.py" beforeDir="false" afterPath="$PROJECT_DIR$/utils/snippets.py" afterDir="false" />
18
+ </list>
19
+ <option name="SHOW_DIALOG" value="false" />
20
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
21
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
22
+ <option name="LAST_RESOLUTION" value="IGNORE" />
23
+ </component>
24
+ <component name="Git.Settings">
25
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
26
+ </component>
27
+ <component name="MarkdownSettingsMigration">
28
+ <option name="stateVersion" value="1" />
29
+ </component>
30
+ <component name="ProjectId" id="27LFq9lTgR3bspJNCu2Zpj2aqJy" />
31
+ <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
32
+ <component name="ProjectViewState">
33
+ <option name="hideEmptyMiddlePackages" value="true" />
34
+ <option name="showLibraryContents" value="true" />
35
+ </component>
36
+ <component name="PropertiesComponent">{
37
+ &quot;keyToString&quot;: {
38
+ &quot;WebServerToolWindowFactoryState&quot;: &quot;true&quot;,
39
+ &quot;last_opened_file_path&quot;: &quot;/Volumes/Riesling/TRAIN/AI-base/src/NamedEntityRecognization/report&quot;,
40
+ &quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
41
+ &quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
42
+ &quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
43
+ &quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
44
+ &quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
45
+ &quot;settings.editor.selected.configurable&quot;: &quot;com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable&quot;
46
+ }
47
+ }</component>
48
+ <component name="RecentsManager">
49
+ <key name="CopyFile.RECENT_KEYS">
50
+ <recent name="$PROJECT_DIR$/report" />
51
+ <recent name="$PROJECT_DIR$/data" />
52
+ <recent name="$PROJECT_DIR$" />
53
+ </key>
54
+ </component>
55
+ <component name="RunManager" selected="Python.predict">
56
+ <configuration name="evaluate" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
57
+ <module name="NamedEntityRecognization" />
58
+ <option name="INTERPRETER_OPTIONS" value="" />
59
+ <option name="PARENT_ENVS" value="true" />
60
+ <envs>
61
+ <env name="PYTHONUNBUFFERED" value="1" />
62
+ </envs>
63
+ <option name="SDK_HOME" value="" />
64
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
65
+ <option name="IS_MODULE_SDK" value="true" />
66
+ <option name="ADD_CONTENT_ROOTS" value="true" />
67
+ <option name="ADD_SOURCE_ROOTS" value="true" />
68
+ <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
69
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/evaluate.py" />
70
+ <option name="PARAMETERS" value="" />
71
+ <option name="SHOW_COMMAND_LINE" value="false" />
72
+ <option name="EMULATE_TERMINAL" value="false" />
73
+ <option name="MODULE_MODE" value="false" />
74
+ <option name="REDIRECT_INPUT" value="false" />
75
+ <option name="INPUT_FILE" value="" />
76
+ <method v="2" />
77
+ </configuration>
78
+ <configuration name="model" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
79
+ <module name="NamedEntityRecognization" />
80
+ <option name="INTERPRETER_OPTIONS" value="" />
81
+ <option name="PARENT_ENVS" value="true" />
82
+ <envs>
83
+ <env name="PYTHONUNBUFFERED" value="1" />
84
+ </envs>
85
+ <option name="SDK_HOME" value="" />
86
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
87
+ <option name="IS_MODULE_SDK" value="true" />
88
+ <option name="ADD_CONTENT_ROOTS" value="true" />
89
+ <option name="ADD_SOURCE_ROOTS" value="true" />
90
+ <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
91
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/model.py" />
92
+ <option name="PARAMETERS" value="" />
93
+ <option name="SHOW_COMMAND_LINE" value="false" />
94
+ <option name="EMULATE_TERMINAL" value="false" />
95
+ <option name="MODULE_MODE" value="false" />
96
+ <option name="REDIRECT_INPUT" value="false" />
97
+ <option name="INPUT_FILE" value="" />
98
+ <method v="2" />
99
+ </configuration>
100
+ <configuration name="predict" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
101
+ <module name="NamedEntityRecognization" />
102
+ <option name="INTERPRETER_OPTIONS" value="" />
103
+ <option name="PARENT_ENVS" value="true" />
104
+ <envs>
105
+ <env name="PYTHONUNBUFFERED" value="1" />
106
+ </envs>
107
+ <option name="SDK_HOME" value="" />
108
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
109
+ <option name="IS_MODULE_SDK" value="true" />
110
+ <option name="ADD_CONTENT_ROOTS" value="true" />
111
+ <option name="ADD_SOURCE_ROOTS" value="true" />
112
+ <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
113
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/predict.py" />
114
+ <option name="PARAMETERS" value="" />
115
+ <option name="SHOW_COMMAND_LINE" value="false" />
116
+ <option name="EMULATE_TERMINAL" value="false" />
117
+ <option name="MODULE_MODE" value="false" />
118
+ <option name="REDIRECT_INPUT" value="false" />
119
+ <option name="INPUT_FILE" value="" />
120
+ <method v="2" />
121
+ </configuration>
122
+ <configuration name="statistic" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
123
+ <module name="NamedEntityRecognization" />
124
+ <option name="INTERPRETER_OPTIONS" value="" />
125
+ <option name="PARENT_ENVS" value="true" />
126
+ <envs>
127
+ <env name="PYTHONUNBUFFERED" value="1" />
128
+ </envs>
129
+ <option name="SDK_HOME" value="" />
130
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
131
+ <option name="IS_MODULE_SDK" value="true" />
132
+ <option name="ADD_CONTENT_ROOTS" value="true" />
133
+ <option name="ADD_SOURCE_ROOTS" value="true" />
134
+ <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
135
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/statistic.py" />
136
+ <option name="PARAMETERS" value="" />
137
+ <option name="SHOW_COMMAND_LINE" value="false" />
138
+ <option name="EMULATE_TERMINAL" value="false" />
139
+ <option name="MODULE_MODE" value="false" />
140
+ <option name="REDIRECT_INPUT" value="false" />
141
+ <option name="INPUT_FILE" value="" />
142
+ <method v="2" />
143
+ </configuration>
144
+ <configuration name="train" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
145
+ <module name="NamedEntityRecognization" />
146
+ <option name="INTERPRETER_OPTIONS" value="" />
147
+ <option name="PARENT_ENVS" value="true" />
148
+ <envs>
149
+ <env name="PYTHONUNBUFFERED" value="1" />
150
+ </envs>
151
+ <option name="SDK_HOME" value="" />
152
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
153
+ <option name="IS_MODULE_SDK" value="true" />
154
+ <option name="ADD_CONTENT_ROOTS" value="true" />
155
+ <option name="ADD_SOURCE_ROOTS" value="true" />
156
+ <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
157
+ <EXTENSION ID="net.ashald.envfile">
158
+ <option name="IS_ENABLED" value="false" />
159
+ <option name="IS_SUBST" value="false" />
160
+ <option name="IS_PATH_MACRO_SUPPORTED" value="false" />
161
+ <option name="IS_IGNORE_MISSING_FILES" value="false" />
162
+ <option name="IS_ENABLE_EXPERIMENTAL_INTEGRATIONS" value="false" />
163
+ <ENTRIES>
164
+ <ENTRY IS_ENABLED="true" PARSER="runconfig" />
165
+ </ENTRIES>
166
+ </EXTENSION>
167
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/train.py" />
168
+ <option name="PARAMETERS" value="" />
169
+ <option name="SHOW_COMMAND_LINE" value="true" />
170
+ <option name="EMULATE_TERMINAL" value="false" />
171
+ <option name="MODULE_MODE" value="false" />
172
+ <option name="REDIRECT_INPUT" value="false" />
173
+ <option name="INPUT_FILE" value="" />
174
+ <method v="2" />
175
+ </configuration>
176
+ <list>
177
+ <item itemvalue="Python.predict" />
178
+ <item itemvalue="Python.statistic" />
179
+ <item itemvalue="Python.train" />
180
+ <item itemvalue="Python.model" />
181
+ <item itemvalue="Python.evaluate" />
182
+ </list>
183
+ <recent_temporary>
184
+ <list>
185
+ <item itemvalue="Python.predict" />
186
+ <item itemvalue="Python.train" />
187
+ <item itemvalue="Python.evaluate" />
188
+ <item itemvalue="Python.model" />
189
+ <item itemvalue="Python.statistic" />
190
+ </list>
191
+ </recent_temporary>
192
+ </component>
193
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="应用程序级" UseSingleDictionary="true" transferred="true" />
194
+ <component name="TaskManager">
195
+ <task active="true" id="Default" summary="默认任务">
196
+ <changelist id="626a281e-0f78-4eb9-9469-6e0d7f35140d" name="变更" comment="" />
197
+ <created>1649091649915</created>
198
+ <option name="number" value="Default" />
199
+ <option name="presentableId" value="Default" />
200
+ <updated>1649091649915</updated>
201
+ <workItem from="1649091655927" duration="9332000" />
202
+ <workItem from="1649773940694" duration="6925000" />
203
+ <workItem from="1651504862776" duration="153000" />
204
+ <workItem from="1651924741385" duration="694000" />
205
+ <workItem from="1658891597769" duration="13145000" />
206
+ <workItem from="1661422884262" duration="704000" />
207
+ </task>
208
+ <servers />
209
+ </component>
210
+ <component name="TypeScriptGeneratedFilesManager">
211
+ <option name="version" value="3" />
212
+ </component>
213
+ <component name="XDebuggerManager">
214
+ <breakpoint-manager>
215
+ <breakpoints>
216
+ <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
217
+ <url>file://$PROJECT_DIR$/predict.py</url>
218
+ <line>36</line>
219
+ <option name="timeStamp" value="5" />
220
+ </line-breakpoint>
221
+ <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
222
+ <url>file://$PROJECT_DIR$/train.py</url>
223
+ <line>110</line>
224
+ <option name="timeStamp" value="20" />
225
+ </line-breakpoint>
226
+ <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
227
+ <url>file://$PROJECT_DIR$/preprocess.py</url>
228
+ <line>81</line>
229
+ <option name="timeStamp" value="21" />
230
+ </line-breakpoint>
231
+ <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
232
+ <url>file://$PROJECT_DIR$/preprocess.py</url>
233
+ <line>107</line>
234
+ <option name="timeStamp" value="24" />
235
+ </line-breakpoint>
236
+ <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
237
+ <url>file://$PROJECT_DIR$/utils/snippets.py</url>
238
+ <line>509</line>
239
+ <option name="timeStamp" value="28" />
240
+ </line-breakpoint>
241
+ <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
242
+ <url>file://$PROJECT_DIR$/utils/snippets.py</url>
243
+ <line>506</line>
244
+ <option name="timeStamp" value="30" />
245
+ </line-breakpoint>
246
+ </breakpoints>
247
+ </breakpoint-manager>
248
+ </component>
249
+ <component name="com.intellij.coverage.CoverageDataManagerImpl">
250
+ <SUITE FILE_PATH="coverage/NamedEntityRecognization$train.coverage" NAME="train 覆盖结果" MODIFIED="1658905350571" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
251
+ <SUITE FILE_PATH="coverage/NamedEntityRecognization$model.coverage" NAME="model 覆盖结果" MODIFIED="1649776894188" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
252
+ <SUITE FILE_PATH="coverage/NamedEntityRecognization$evaluate.coverage" NAME="evaluate 覆盖结果" MODIFIED="1649825507637" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
253
+ <SUITE FILE_PATH="coverage/NamedEntityRecognization$statistic.coverage" NAME="statistic 覆盖结果" MODIFIED="1649172187190" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
254
+ <SUITE FILE_PATH="coverage/NamedEntityRecognization$predict.coverage" NAME="predict 覆盖结果" MODIFIED="1658911968974" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
255
+ </component>
256
+ </project>
chinese_medical_ner/ccksyidu4k-ner-roformer/README.md ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CCKS2019医渡云4k电子病历数据集命名实体识别
2
+
3
+ ## Dataset
4
+
5
+ Yidu-S4K数据集,对于给定的一组电子病历纯文本文档,任务的目标是识别并抽取出与医学临床相关的实体提及(entity mention),并将它们归类到预定义类别(pre-defined
6
+ categories),比如疾病、治疗、检查检验等。
7
+
8
+ 1. 疾病和诊断:医学上定义的疾病和医生在临床工作中对病因、病生理、分型分期等所作的判断。
9
+ 2. 检查: 影像检查(X线、CT、MR、PETCT等)+造影+超声+心电图,为避免检查操作与手术操作过多冲突,不包含此外其它的诊断性操作,如胃镜、肠镜等。
10
+ 3. 检验: 在实验室进行的物理或化学检查,本期特指临床工作中检验科进行的化验,不含免疫组化等广义实验室检查
11
+ 4. 手术: 医生在患者身体局部进行的切除、缝合等治疗,是外科的主要治疗方法。
12
+ 5. 药物: 用于疾病治疗的具体化学物质。
13
+ 6. 解剖部位: 指疾病、症状和体征发生的人体解剖学部位。
14
+
15
+ 任务一数据结构: 任务一数据每一行为一个json json key 为`['originalText','entities']` 即原文和实体列表 `json["entities"]`
16
+ 为列表,每个元素代表一个实体entity,其中有该实体在原文中的起始位置`start_pos`,结束位置`end_pos`,以及实体类型
17
+
18
+ 训练样本1000条,提交的测试样本379条,经过处理后转成BIO格式,形如:
19
+
20
+ ```
21
+ 心 B-TESTIMAGE
22
+ 脏 I-TESTIMAGE
23
+ 彩 I-TESTIMAGE
24
+ 超 I-TESTIMAGE
25
+ : O
26
+ 右 B-ANATOMY
27
+ 房 I-ANATOMY
28
+ 、 O
29
+ 右 B-ANATOMY
30
+ 室 I-ANATOMY
31
+ 稍 O
32
+ 增 O
33
+ 大 O
34
+ , O
35
+ E B-TESTLAB
36
+ F I-TESTLAB
37
+ 正 O
38
+ 常 O
39
+ 。 O
40
+ ```
41
+
42
+ ATTENTION:
43
+
44
+ - 字与标签之间用tab("\t")隔开
45
+ - 其中句子与句子之间使用空行隔开
46
+ - 文件最后以两个换行结束
47
+
48
+ 句长与数量信息可以运行`statistic.py`以查看
49
+
50
+ ## Project Structure
51
+
52
+ ```
53
+ ./
54
+ ├── README.md
55
+ ├── __pycache__
56
+ ├── chinese_roformer-v2-char_L-12_H-768_A-12 roformer_v2 base权重文件
57
+ │   ├── bert_config.json
58
+ │   ├── bert_model.ckpt.data-00000-of-00001
59
+ │   ├── bert_model.ckpt.index
60
+ │   ├── bert_model.ckpt.meta
61
+ │   ├── checkpoint
62
+ │   └── vocab.txt
63
+ ├── chinese_roformer-v2-char_L-6_H-384_A-6 roformer_v2 small 权重文件
64
+ │   ├── bert_config.json
65
+ │   ├── bert_model.ckpt.data-00000-of-00001
66
+ │   ├── bert_model.ckpt.index
67
+ │   ├── bert_model.ckpt.meta
68
+ │   ├── checkpoint
69
+ │   └── vocab.txt
70
+ ├── config.py 模型可能需要调整的超参数
71
+ ├── data 数据集文件夹
72
+ │   ├── yidu.test 官方提供的379个测试样本
73
+ │   ├── yidu.train 从官方1000个训练样本中划分的训练集
74
+ │   ├── yidu.validate 从官方1000个训练样本中划分的验证集
75
+ │   └── yidu_catagory.pkl 类别set,由train.py生成,predict.py中用到
76
+ ├── evaluate.py
77
+ ├── images 训练、评估数据生成的图片
78
+ │   ├── train_acc.png
79
+ │   ├── train_loss.png
80
+ │   └── val_f1.png
81
+ ├── log 训练日志,由train.py生成
82
+ │   ├── train_loss.csv
83
+ │   ├── val_f1.csv
84
+ │   ├── yidu.out
85
+ │   └── yidu_f1.out
86
+ ├── model.py 构建模型
87
+ ├── path.py 所有路径
88
+ ├── predict.py 模型预测输出
89
+ ├── preprocess.py 数据预处理
90
+ ├── statistic.py 统计句长与数量信息,以便调整和设置maxlen
91
+ ├── report 评估报告,由evaluate.py生成
92
+ │   └── yidu_bert_base.csv 每个类别的精准、召回、F1
93
+ ├── train.py 训练文件
94
+ ├── requirements.txt pip环境
95
+ ├── plot.py 画图工具
96
+ ├── utils bert4keras工具包,也可pip下载
97
+ │   ├── __init__.py
98
+ │   ├── __pycache__
99
+ │   ├── backend.py
100
+ │   ├── layers.py
101
+ │   ├── models.py
102
+ │   ├── optimizers.py
103
+ │   ├── snippets.py
104
+ │   └── tokenizers.py
105
+ └── weights 保存的权重
106
+ ├── yidu_catagory.pkl 实体类别
107
+ ├── yidu_roformer_v2_base.h5 模型权重
108
+ └── yidu_roformer_v2_crf_trans.pkl 最佳模型的权重
109
+ ```
110
+
111
+ ## Requirements
112
+
113
+ ```
114
+ Keras==2.2.4
115
+ matplotlib==3.4.0
116
+ pandas==1.2.3
117
+ tensorflow==1.14.0
118
+ tqdm==4.61.2
119
+ ```
120
+
121
+ ## Steps
122
+
123
+ 1. 替换数据集
124
+ 2. 修改path.py中的地址
125
+ 3. 删掉旧的weights/{}_catagory.pkl类别set文件
126
+ 4. 根据需要修改model.py模型结构
127
+ 5. 修改config.py的参数
128
+ 6. Debug
129
+ 7. 训练
130
+
131
+ ## Model
132
+
133
+ ### 上游
134
+
135
+ [GitHub - ZhuiyiTechnology/roformer-v2: RoFormer升级版](https://github.com/ZhuiyiTechnology/roformer-v2)
136
+ 是RoFormer升级版,主要通过结构的简化来提升速度,并通过无监督预训练和有监督预训练的结合来提升效果,从而达到了速度与效果的“双赢”。相比RoFormer,RoFormerV2的主要改动是简化模型结构、增加训练数据以及加入有监督训练,这些改动能让RoFormerV2最终取得了速度和效果的“双赢”。
137
+
138
+ - **Small版**
139
+ : [chinese_roformer-v2-char_L-6_H-384_A-6.zip](https://open.zhuiyi.ai/releases/nlp/models/zhuiyi/chinese_roformer-v2-char_L-6_H-384_A-6.zip)
140
+ - **Base版**
141
+ : [chinese_roformer-v2-char_L-12_H-768_A-12.zip](https://open.zhuiyi.ai/releases/nlp/models/zhuiyi/chinese_roformer-v2-char_L-12_H-768_A-12.zip)
142
+ - **Large版**
143
+ : [chinese_roformer-v2-char_L-24_H-1024_A-16.zip](https://open.zhuiyi.ai/releases/nlp/models/zhuiyi/chinese_roformer-v2-char_L-24_H-1024_A-16.zip)
144
+
145
+ ### 下游
146
+
147
+ ![](images/downstream.png)
148
+
149
+ 模型大小
150
+
151
+ > * **Small版**:两张3090(24G),先用无监督MLM训练了100万步(maxlen为512),然后有监督多任务训练了75万步(maxlen从64到512不等,取决于任务),batch_size为512,优化器为LAMB;
152
+ > * **Base版**:四张3090(24G),先用无监督MLM训练了100万步(maxlen为512),然后有监督多任务训练了75万步(maxlen从64到512不等,取决于任务),batch_size为512,优化器为LAMB;
153
+ > * **Large版**:两张A100(80G),先用无监督MLM训练了100万步(maxlen为512),然后有监督多任务训练了50万步(maxlen从64到512不等,取决于任务),batch_size为512,优化器为LAMB。
154
+
155
+ ## Config
156
+
157
+ - `maxlen` 训练中每个batch的最大单句长度,少于填充,多于截断
158
+ - `epochs` 最大训练轮次
159
+ - `batch_size` batch size
160
+ - `bert_layers` bert层数,small ≤ 4,base ≤ 12
161
+ - `crf_lr_multiplier` CRF层放大的学习率,必要时扩大它
162
+ - `model_type` 模型, 'roformer_v2'
163
+ - `dropout_rate` dropout比率
164
+ - `max_lr` 最大学习率,bert_layers越大应该越小,small建议5e-5~1e-4,base建议1e-5~5e-5
165
+ - `lstm_hidden_units` lstm隐藏层数量
166
+
167
+ ATTENTION: 并非所有句子都要填充到同一个长度,要求每个batch内的每个样本长度一致即可。所以若batch中最大长度 ≤ maxlen,则该batch将填充or截断到最长句子长度,若batch中最大长度 >
168
+ maxlen,则该batch将填充or截断到config.py中的maxlen
169
+
170
+ ## Train
171
+
172
+ ### 策略
173
+
174
+ #### 划分策略
175
+
176
+ 将1000条训练样本按8:2划分成训练集、验证集,并shuffle。
177
+
178
+ #### 优化策略
179
+
180
+ - 使用EMA(exponential moving average)滑动平均配合Adam作为优化策略。滑动平均可以用来估计变量的局部值,使得变量的更新与一段时间内的历史值有关。它的意义在于利用滑动平均的参数来提高模型在测试数据上的健壮性。
181
+ EMA 对每一个待更新训练学习的变量 (variable) 都会维护一个影子变量 (shadow variable)。影子变量的初始值就是这个变量的初始值。
182
+ - BERT模型由于已经有了预训练权重,所以微调权重只需要很小的学习率,而LSTM和Dense使用的`he_normal`
183
+ 初始化学习率,需要使用较大学习率,所以本模型使用[分层学习率](https://kexue.fm/archives/6418)
184
+ - 在Embedding层注入扰动,[对抗训练](https://kexue.fm/archives/7234) ,使模型更具鲁棒性。
185
+
186
+ #### 停止策略
187
+
188
+ 在callback中计算验证集实体F1值,监控它。5轮不升即停。
189
+
190
+ ### 日志
191
+
192
+ ```
193
+ Epoch 1/999
194
+ 78/78 [==============================] - 342s 4s/step - loss: 44.7248 - sparse_accuracy: 0.8038
195
+ valid: f1: 0.05063, precision: 0.06611, recall: 0.04103, best f1: 0.05063
196
+ Epoch 2/999
197
+ 78/78 [==============================] - 313s 4s/step - loss: 13.2246 - sparse_accuracy: 0.9135
198
+ valid: f1: 0.67956, precision: 0.70216, recall: 0.65837, best f1: 0.67956
199
+ Epoch 3/999
200
+ 78/78 [==============================] - 319s 4s/step - loss: 5.9724 - sparse_accuracy: 0.9418
201
+ valid: f1: 0.81794, precision: 0.83338, recall: 0.80306, best f1: 0.81794
202
+
203
+ ...
204
+
205
+ Epoch 16/999
206
+ 78/78 [==============================] - 308s 4s/step - loss: 1.6843 - sparse_accuracy: 0.9109
207
+ Early stop count 3/5
208
+ valid: f1: 0.87578, precision: 0.86848, recall: 0.88321, best f1: 0.87753
209
+ Epoch 17/999
210
+ 78/78 [==============================] - 323s 4s/step - loss: 1.5966 - sparse_accuracy: 0.9090
211
+ Early stop count 4/5
212
+ valid: f1: 0.87717, precision: 0.86962, recall: 0.88485, best f1: 0.87753
213
+ Epoch 18/999
214
+ 78/78 [==============================] - 324s 4s/step - loss: 1.4774 - sparse_accuracy: 0.9092
215
+ Early stop count 5/5
216
+ Epoch 00018: early stopping THR
217
+ valid: f1: 0.87693, precision: 0.86916, recall: 0.88485, best f1: 0.87753
218
+ ```
219
+
220
+ 训练集crf loss
221
+
222
+ ![](images/yidu_train_loss.png)
223
+
224
+ 训练集crf acc:
225
+
226
+ ![](images/yidu_train_acc.png)
227
+
228
+ ### Evaluate
229
+
230
+ ### 策略
231
+
232
+ 评估策略为实体级别的F1,抽取到的每个实体的label、在每句中的起始坐标、终止坐标都正确才算对
233
+
234
+ 可以评估:
235
+
236
+ - 总的F1:所有类别一起统计,TP为所有label、起始坐标、终止坐标都正确的个数,TP+FP为预测实体总数,TP+FN为真实实体总数
237
+ - 每类的F1:分类统计,TP为每个类别的起始坐标、终止坐标都正确的个数,TP+FP为每个类别的预测实体总数,TP+FN为每个类别的真实实体总数
238
+
239
+ ### 评估单个模型
240
+
241
+ ```python
242
+ evaluate_one(save_file_path = weights_path + '/yidu_roformer_v2_base.h5',
243
+ dataset_path = "./data/yidu.test",
244
+ csv_path = './report/yidu_bert_base.csv',
245
+ evaluate_categories_f1 = True)
246
+ ```
247
+
248
+ `save_file_path`,`dataset_path`是评估数据集路径,`evaluate_categories_f1`为是否评估每个类别的F1(时间会比评估总的F1长很多),`csv_path`
249
+ 是每类F1数据生成的csv文件存放路径。
250
+
251
+ ATTENTION: 1个batch只进1条句子,所以可以无视train的maxlen,但是tokenize后长于512的部分将无法被预测,也不会被算进P里
252
+
253
+ ## Performance
254
+
255
+ ### 测试集表现
256
+
257
+ ![](images/yidu_val_f1.png)
258
+
259
+ ### 验证集最佳F1
260
+
261
+ ```
262
+ Epoch 13/999
263
+ 78/78 [==============================] - 314s 4s/step - loss: 1.9135 - sparse_accuracy: 0.9114
264
+ valid: f1: 0.87753, precision: 0.87033, recall: 0.88485, best f1: 0.87753
265
+ ```
266
+
267
+ ### 官方提供的379条测试样本表现
268
+
269
+ ```
270
+ weight path:/home/bureaux/Projects/NamedEntityRecognization/weights/yidu_roformer_v2_base.h5
271
+ evaluate dataset path:./data/yidu.test
272
+ Evaluating General F1: 100%|████████████████████████████████████| 2035/2035 [03:11<00:00, 10.60it/s]
273
+ General: f1: 0.87700, precision: 0.86014, recall: 0.89454
274
+ ```
275
+
276
+ ### 官方提供的379条测试样本每个类别的F1评测结果
277
+
278
+ ```
279
+ Evaluating F1 of each Categories: 100%|█████████████████████████| 2035/2035 [19:37<00:00, 1.73it/s]
280
+ TP TP+FP TP+FN precision recall f1
281
+ ANATOMY 2788 3286 3094 0.8484 0.9011 0.8740
282
+ DISEASE 1176 1332 1323 0.8829 0.8889 0.8859
283
+ DRUG 470 497 485 0.9457 0.9691 0.9572
284
+ OPERATION 143 158 162 0.9051 0.8827 0.8938
285
+ TESTIMAGE 326 366 348 0.8907 0.9368 0.9132
286
+ TESTLAB 466 603 590 0.7728 0.7898 0.7812
287
+ ```
288
+
289
+ ## Predict
290
+
291
+ ```python
292
+ txt = '1997-8-6行胃癌根治术,2010.11发现CA724 升高最高1295 ,复查PET-CT检查未见复发转移,之后多次复查CA724 波动在500-800之间,多次查胃镜提示吻合口炎,给予对症治疗,患者感左下腹隐痛下腹隐痛不适,2013.10.15复查血CA724 147 CA199 13.62 ,2013.10.23复查腹部CT检查提示胰腺占位,考虑恶性,胰头周围,肝门,腹膜后多发多发淋巴结转移。PET-CT提示:胰头区高代谢,考虑恶性病变。患者近10天出现午饭后左下腹部胀痛,持续2-3小时候可自行缓解。体重近1月上降2KG.患者胰腺穿刺取病理示低分化腺癌,免疫组化示CEA+,CGA+/-,CD56+/-,SYN+/-,对手术有顾虑,且手术风险较大,2013-11-26行放疗30次,2014-1-7放疗结束。2013-11-28始行单药吉西他滨化疗4周期。末次2014-1-7.放化疗中出现黄疸,对症治疗后好转。化疗后患者出现II度白细胞降低、II度血小板降低。2014-1-24复查胰头区病灶及腹腔淋巴结均较强缩小,胰腺穿刺病理中低分化腺癌,免疫组化CA19+,CK7+,CGA-,SYN-,CD56-,CA199+,符合胆、胰导管来源浸润性腺癌。CA72.4 明显上降。2014-1-27病理比对原胃切除标本报告与胰腺肿瘤存在较大形态差异。考虑患者明确胰腺癌,于2014-2-7行第5周期GEM化疗,2014-2复查后病灶缩小SD,于2014-2-21开始第六周期化疗,因第八天白细胞减少推迟到2014-3-3。2014-4-7第8周期化疗。末次给药2014-4-14.2014-4-21复查评效SD,略有缩小,CA72.4降低至11.12.2014-4-28继续单药GEM化疗,末次给药时间2014-9-1.GEM双周一次,2014-7-24复查胰腺病灶继续缩小,评效PR。现患者无明显不适,饮食、睡眠可,体重较前上降约4KG。'
293
+ for i in predict(txt = txt,
294
+ weights_path = weights_path + '/yidu_roformer_v2_base.h5',
295
+ label_dict_path = label_dict_path,
296
+ trans_path = "./weights/yidu_roformer_v2_crf_trans.pkl"):
297
+ print(i)
298
+ ```
299
+
300
+ txt为输入文本,save_file_path为使用权重的路径,label_dict_path为实体类别字典的pkl文件,trans_path为模型转移矩阵文件。缺一不可。
301
+
302
+ 输出结果
303
+
304
+ ```
305
+ [
306
+ ('胃癌根治术', 'OPERATION', 9, 13)
307
+ ('CA724', 'TESTLAB', 24, 28)
308
+ ('PET-CT', 'TESTIMAGE', 42, 47)
309
+ ('CA724', 'TESTLAB', 63, 67)
310
+ ('吻合口炎', 'DISEASE', 89, 92)
311
+ ('左下腹', 'ANATOMY', 104, 106)
312
+ ('下腹', 'ANATOMY', 109, 110)
313
+ ('CA724', 'TESTLAB', 129, 133)
314
+ ('腹部CT', 'TESTIMAGE', 164, 167)
315
+ ('胰腺', 'ANATOMY', 172, 173)
316
+ ('胰头', 'ANATOMY', 182, 183)
317
+ ('肝门', 'ANATOMY', 187, 188)
318
+ ('腹膜', 'ANATOMY', 190, 191)
319
+ ('PET-CT', 'TESTIMAGE', 203, 208)
320
+ ('胰头区', 'ANATOMY', 212, 214)
321
+ ('左下腹部', 'ANATOMY', 237, 240)
322
+ ('胰腺', 'ANATOMY', 271, 272)
323
+ ('低分化腺癌', 'DISEASE', 279, 283)
324
+ ('吉西他滨', 'DRUG', 376, 379)
325
+ ('白细胞', 'TESTLAB', 424, 426)
326
+ ('血小板', 'TESTLAB', 433, 435)
327
+ ('胰头区', 'ANATOMY', 450, 452)
328
+ ('腹腔淋巴结', 'ANATOMY', 456, 460)
329
+ ('胰腺', 'ANATOMY', 467, 468)
330
+ ('中低分化腺癌', 'DISEASE', 473, 478)
331
+ ('胆', 'ANATOMY', 520, 520)
332
+ ('胰', 'ANATOMY', 522, 522)
333
+ ('CA72.4', 'TESTLAB', 533, 538)
334
+ ('胃', 'ANATOMY', 559, 559)
335
+ ('胰腺肿瘤', 'DISEASE', 567, 570)
336
+ ('胰腺癌', 'DISEASE', 586, 588)
337
+ ]
338
+ ```
339
+
340
+ 输出格式为`(实体, 类别, 起始坐标, 终止坐标)`
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/config.cpython-310.pyc ADDED
Binary file (419 Bytes). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/config.cpython-37.pyc ADDED
Binary file (413 Bytes). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/config.cpython-38.pyc ADDED
Binary file (407 Bytes). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/evaluate.cpython-310.pyc ADDED
Binary file (4.57 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/evaluate.cpython-37.pyc ADDED
Binary file (4.65 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/evaluate.cpython-38.pyc ADDED
Binary file (4.59 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/model.cpython-310.pyc ADDED
Binary file (4.28 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/model.cpython-37.pyc ADDED
Binary file (4.18 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/model.cpython-38.pyc ADDED
Binary file (4.23 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/path.cpython-310.pyc ADDED
Binary file (976 Bytes). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/path.cpython-37.pyc ADDED
Binary file (970 Bytes). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/path.cpython-38.pyc ADDED
Binary file (964 Bytes). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/plot.cpython-310.pyc ADDED
Binary file (1.62 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/plot.cpython-37.pyc ADDED
Binary file (1.68 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/plot.cpython-38.pyc ADDED
Binary file (1.61 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/predict.cpython-37.pyc ADDED
Binary file (2.58 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/preprocess.cpython-310.pyc ADDED
Binary file (4.22 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/preprocess.cpython-37.pyc ADDED
Binary file (4.18 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/__pycache__/preprocess.cpython-38.pyc ADDED
Binary file (4.18 kB). View file
 
chinese_medical_ner/ccksyidu4k-ner-roformer/calc_bert_matrix.ipynb ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 11,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
13
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
14
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
15
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
16
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
17
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
18
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
19
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
20
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
21
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
22
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
23
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
24
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
25
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
26
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
27
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
28
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
29
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
30
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
31
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
32
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
33
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
34
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
35
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
36
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
37
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
38
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
39
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
40
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
41
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
42
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
43
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
44
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
45
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
46
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
47
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
48
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
49
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
50
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
51
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
52
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
53
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
54
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
55
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
56
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
57
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
58
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
59
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
60
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
61
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
62
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
63
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n"
64
+ ]
65
+ },
66
+ {
67
+ "name": "stdout",
68
+ "output_type": "stream",
69
+ "text": [
70
+ "12\n",
71
+ "28\n",
72
+ "28.835511207580566\n",
73
+ "[[0.8042815 0.83821416 0.7857758 0.8936385 0.7276579 0.83560634\n",
74
+ " 0.83956754 0.7425114 0.82972634 0.840919 0.8455287 0.8671753\n",
75
+ " 0.7728379 0.43285608 0.82833314 0.8327997 0.8392484 0.8234416\n",
76
+ " 0.8522128 0.51038194 0.82206476 0.7454972 0.8382132 0.49966788\n",
77
+ " 0.86459064 0.7834512 0.8475671 0.85460234]\n",
78
+ " [0.8011063 0.828354 0.82743424 0.850899 0.73062104 0.85595804\n",
79
+ " 0.8435649 0.7553144 0.8425723 0.82148576 0.80054176 0.89214945\n",
80
+ " 0.79418015 0.47419527 0.81845486 0.8461245 0.8021023 0.7938319\n",
81
+ " 0.79460996 0.5338131 0.87848425 0.7716693 0.8170972 0.52933466\n",
82
+ " 0.8097694 0.83968496 0.839522 0.8165166 ]\n",
83
+ " [0.4245787 0.4378011 0.42134485 0.46912104 0.38410604 0.44984287\n",
84
+ " 0.41388378 0.44095236 0.43873137 0.44801378 0.43414456 0.4509009\n",
85
+ " 0.41366065 0.7984361 0.4289 0.43039462 0.42809123 0.4324836\n",
86
+ " 0.46127015 0.86037296 0.41747275 0.38431278 0.48179275 0.84873366\n",
87
+ " 0.4345677 0.41943404 0.46897653 0.45358443]\n",
88
+ " [0.7554055 0.77900416 0.7624301 0.8424594 0.68307996 0.8088174\n",
89
+ " 0.80660224 0.69681954 0.7785535 0.8220203 0.79812443 0.8501669\n",
90
+ " 0.7326208 0.41718763 0.7723533 0.8132994 0.8087872 0.77721477\n",
91
+ " 0.7891983 0.46248394 0.7991282 0.73404676 0.81659716 0.46346214\n",
92
+ " 0.79148304 0.7274809 0.9603679 0.77111566]\n",
93
+ " [0.8101381 0.8370694 0.8437565 0.87504846 0.73189175 0.86311483\n",
94
+ " 0.8619976 0.79309046 0.8413706 0.8296794 0.8228364 0.99999994\n",
95
+ " 0.8147346 0.4384241 0.81975913 0.8577111 0.8390564 0.8067612\n",
96
+ " 0.8274136 0.51204675 0.88608086 0.7762571 0.8515235 0.50785893\n",
97
+ " 0.80906993 0.8036982 0.87490416 0.8234324 ]\n",
98
+ " [0.4217796 0.48954895 0.42723012 0.4532832 0.3561649 0.4448802\n",
99
+ " 0.4336366 0.45388544 0.4319604 0.46770507 0.41890997 0.44228512\n",
100
+ " 0.43652567 0.93544704 0.446108 0.46484137 0.39359793 0.39574915\n",
101
+ " 0.45182198 0.8406079 0.425097 0.39100745 0.47122467 0.8352574\n",
102
+ " 0.42255652 0.4323899 0.4527101 0.43198568]\n",
103
+ " [0.8338187 0.8482033 0.7583538 0.9017825 0.7151871 0.84789246\n",
104
+ " 0.8150497 0.7093185 0.8569419 0.8142565 0.899078 0.84663707\n",
105
+ " 0.7619577 0.44392824 0.79649574 0.80953574 0.8414211 0.8342018\n",
106
+ " 0.80380815 0.4652272 0.83020467 0.75900805 0.81513274 0.4604773\n",
107
+ " 0.8724065 0.79225063 0.8495691 0.8571184 ]\n",
108
+ " [0.4388544 0.48099732 0.44652414 0.491911 0.39358306 0.4963931\n",
109
+ " 0.46961203 0.4602445 0.45970094 0.49297816 0.44363937 0.50785893\n",
110
+ " 0.42448643 0.8044198 0.4709897 0.47543868 0.4438693 0.4341317\n",
111
+ " 0.47560525 0.94062746 0.46269763 0.41282403 0.49911708 0.9999999\n",
112
+ " 0.45720756 0.43912742 0.51201 0.48214957]\n",
113
+ " [0.852879 0.9037154 0.7821789 0.9420587 0.7552315 0.8836415\n",
114
+ " 0.87547123 0.7251524 0.8828964 0.8373711 0.92202234 0.89002985\n",
115
+ " 0.7854445 0.474783 0.8376787 0.85402507 0.8500402 0.8268823\n",
116
+ " 0.82753396 0.48748526 0.87112916 0.78053 0.82918906 0.48343372\n",
117
+ " 0.841926 0.8271333 0.8875084 0.87484753]\n",
118
+ " [0.7869381 0.85783684 0.7684859 0.8841141 0.66779125 0.7765528\n",
119
+ " 0.75417054 0.73889744 0.85345876 0.863776 0.86478865 0.82011044\n",
120
+ " 0.77472615 0.43941003 0.7532432 0.7775699 0.7492738 0.7495041\n",
121
+ " 0.8271534 0.46770984 0.77475417 0.72683036 0.80998313 0.4553116\n",
122
+ " 0.8437807 0.75127625 0.8239754 0.8759123 ]\n",
123
+ " [0.8414567 0.86158824 0.7993734 0.9158263 0.75114155 0.8754386\n",
124
+ " 0.8565251 0.75108814 0.8627944 0.8455615 0.8663789 0.8859818\n",
125
+ " 0.7832396 0.4651299 0.8199284 0.8319515 0.8332075 0.81501603\n",
126
+ " 0.8339864 0.5201047 0.85637003 0.763462 0.82180524 0.5130591\n",
127
+ " 0.8290285 0.82059264 0.84924185 0.8875982 ]\n",
128
+ " [0.75577044 0.74085385 0.737481 0.7770459 0.708807 0.79907984\n",
129
+ " 0.80543596 0.6826918 0.718661 0.7301651 0.7104209 0.8091022\n",
130
+ " 0.710036 0.4174271 0.80161786 0.8145112 0.7708455 0.76511174\n",
131
+ " 0.74256396 0.4779269 0.79805374 0.7345556 0.75847065 0.48508406\n",
132
+ " 0.7402287 0.755322 0.8002572 0.72530735]]\n",
133
+ "0.916689\n"
134
+ ]
135
+ }
136
+ ],
137
+ "source": [
138
+ "## calc with cpu\n",
139
+ "import time\n",
140
+ "from transformers import BertTokenizer, BertModel\n",
141
+ "import torch\n",
142
+ "import numpy as np\n",
143
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
144
+ "\n",
145
+ "# 初始化模型和分词器\n",
146
+ "tokenizer = BertTokenizer.from_pretrained(\"G:/model_zoo/LM/bert-base-chinese/\")\n",
147
+ "bert_model = BertModel.from_pretrained(\"G:/model_zoo/LM/bert-base-chinese/\")\n",
148
+ "\n",
149
+ "# tgt和out列表\n",
150
+ "# tgt_list = ['症状', '器官', '检查']\n",
151
+ "# out_list = ['病状', '身体部位', '诊断','胃']\n",
152
+ "\n",
153
+ "tgt_list = ['乏力感', '厌世', '躯体不适', '社会功能严重受损', '兴趣减退', '言行紊乱', '脑器质性疾病', '情绪低落', '精神障碍', '情绪差伴躯体不适', '焦虑', '自责']\n",
154
+ "out_list = ['认知行为治疗', '与家人交流障碍', '偶有轻生想法', '长期适应性障碍', '利培酮', '心理治疗', '发呆', '独处时感到被支配', '沉迷学佛后出现精神异常', '与家人交流困难', '急性而短暂的精神病性障碍', '兴趣减退', '有被害妄想和攻击行为', '言语紊乱', '自知力可', '攻击行为', '心脏', '肝脏', '反应慢', '持续的情绪低落', '沉迷学佛', '氢溴酸西酞普兰', '无法胜任家务', '情绪低落', '急性起病', '被害妄想', '社会功能受损', '重度抑郁发作']\n",
155
+ "print(len(tgt_list))\n",
156
+ "print(len(out_list))\n",
157
+ "\n",
158
+ "# 获取词向量\n",
159
+ "def get_word_embedding(word):\n",
160
+ " # 对单词进行编码\n",
161
+ " input_ids = tokenizer.encode(word, add_special_tokens=True, return_tensors='pt')\n",
162
+ " # 获取词向量\n",
163
+ " with torch.no_grad():\n",
164
+ " output = bert_model(input_ids)\n",
165
+ " # 使用[CLS]标记的向量作为句子向量\n",
166
+ " return output.last_hidden_state[:, 0, :].numpy()\n",
167
+ "\n",
168
+ "# 计算相似性矩阵\n",
169
+ "def calculate_similarity_matrix(words_list1, words_list2):\n",
170
+ " if len(words_list1) > 0 and len(words_list2) > 0:\n",
171
+ " embeddings1 = np.array([get_word_embedding(word) for word in words_list1])\n",
172
+ " embeddings2 = np.array([get_word_embedding(word) for word in words_list2])\n",
173
+ " \n",
174
+ " # 计算余弦相似性矩阵\n",
175
+ " similarity_matrix = cosine_similarity(embeddings1.reshape(embeddings1.shape[0],-1), embeddings2.reshape(embeddings2.shape[0],-1))\n",
176
+ " else:\n",
177
+ " similarity_matrix = np.zeros((2,2))\n",
178
+ " return similarity_matrix\n",
179
+ "\n",
180
+ "time1 = time.time()\n",
181
+ "# 计算tgt和out列表的相似性矩阵\n",
182
+ "for i in range(50):\n",
183
+ " similarity_matrix = calculate_similarity_matrix(tgt_list, out_list)\n",
184
+ "time2 = time.time()\n",
185
+ "time_cost = time2-time1\n",
186
+ "print(time_cost)\n",
187
+ "\n",
188
+ "# 打印相似性矩阵\n",
189
+ "print(similarity_matrix)\n",
190
+ "print(np.max(np.array(similarity_matrix),axis=1).mean())\n"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": null,
196
+ "metadata": {},
197
+ "outputs": [
198
+ {
199
+ "name": "stderr",
200
+ "output_type": "stream",
201
+ "text": [
202
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
203
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
204
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
205
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
206
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
207
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
208
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
209
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
210
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
211
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
212
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
213
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
214
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
215
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
216
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
217
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
218
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
219
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
220
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
221
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
222
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
223
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
224
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
225
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
226
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
227
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
228
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
229
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
230
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
231
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
232
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
233
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
234
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
235
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
236
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
237
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
238
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
239
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
240
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
241
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
242
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
243
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
244
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
245
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
246
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
247
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
248
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
249
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
250
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
251
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
252
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
253
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n"
254
+ ]
255
+ },
256
+ {
257
+ "name": "stdout",
258
+ "output_type": "stream",
259
+ "text": [
260
+ "12\n",
261
+ "28\n",
262
+ "29.067501306533813\n",
263
+ "[[[-1. 1. 1. ... 1. 1. 1.]\n",
264
+ " [-1. 1. 1. ... 1. 1. 1.]\n",
265
+ " [-1. 1. -1. ... 1. -1. -1.]\n",
266
+ " ...\n",
267
+ " [ 1. 1. 1. ... 1. 1. 1.]\n",
268
+ " [-1. 1. 1. ... 1. -1. 1.]\n",
269
+ " [ 1. 1. 1. ... 1. 1. 1.]]\n",
270
+ "\n",
271
+ " [[ 1. 1. 1. ... 1. 1. 1.]\n",
272
+ " [ 1. 1. 1. ... 1. 1. 1.]\n",
273
+ " [ 1. 1. -1. ... 1. -1. -1.]\n",
274
+ " ...\n",
275
+ " [-1. 1. 1. ... 1. 1. 1.]\n",
276
+ " [ 1. 1. 1. ... 1. -1. 1.]\n",
277
+ " [-1. 1. 1. ... 1. 1. 1.]]\n",
278
+ "\n",
279
+ " [[ 1. 1. -1. ... 1. 1. 1.]\n",
280
+ " [ 1. 1. -1. ... 1. 1. 1.]\n",
281
+ " [ 1. 1. 1. ... 1. -1. -1.]\n",
282
+ " ...\n",
283
+ " [-1. 1. -1. ... 1. 1. 1.]\n",
284
+ " [ 1. 1. -1. ... 1. -1. 1.]\n",
285
+ " [-1. 1. -1. ... 1. 1. 1.]]\n",
286
+ "\n",
287
+ " ...\n",
288
+ "\n",
289
+ " [[ 1. 1. 1. ... 1. 1. -1.]\n",
290
+ " [ 1. 1. 1. ... 1. 1. -1.]\n",
291
+ " [ 1. 1. -1. ... 1. -1. 1.]\n",
292
+ " ...\n",
293
+ " [-1. 1. 1. ... 1. 1. -1.]\n",
294
+ " [ 1. 1. 1. ... 1. -1. -1.]\n",
295
+ " [-1. 1. 1. ... 1. 1. -1.]]\n",
296
+ "\n",
297
+ " [[-1. 1. 1. ... 1. 1. 1.]\n",
298
+ " [-1. 1. 1. ... 1. 1. 1.]\n",
299
+ " [-1. 1. -1. ... 1. -1. -1.]\n",
300
+ " ...\n",
301
+ " [ 1. 1. 1. ... 1. 1. 1.]\n",
302
+ " [-1. 1. 1. ... 1. -1. 1.]\n",
303
+ " [ 1. 1. 1. ... 1. 1. 1.]]\n",
304
+ "\n",
305
+ " [[-1. 1. 1. ... 1. 1. 1.]\n",
306
+ " [-1. 1. 1. ... 1. 1. 1.]\n",
307
+ " [-1. 1. -1. ... 1. -1. -1.]\n",
308
+ " ...\n",
309
+ " [ 1. 1. 1. ... 1. 1. 1.]\n",
310
+ " [-1. 1. 1. ... 1. -1. 1.]\n",
311
+ " [ 1. 1. 1. ... 1. 1. 1.]]]\n",
312
+ "0.9941406\n"
313
+ ]
314
+ }
315
+ ],
316
+ "source": [
317
+ "## calc with gpu 反而更慢了。。。\n",
318
+ "import time\n",
319
+ "\n",
320
+ "from transformers import BertTokenizer, BertModel\n",
321
+ "import torch\n",
322
+ "import numpy as np\n",
323
+ "# from sklearn.metrics.pairwise import cosine_similarity\n",
324
+ "from torch.nn.functional import cosine_similarity\n",
325
+ "\n",
326
+ "# 初始化模型和分词器\n",
327
+ "tokenizer = BertTokenizer.from_pretrained(\"G:/model_zoo/LM/bert-base-chinese/\")\n",
328
+ "bert_model = BertModel.from_pretrained(\"G:/model_zoo/LM/bert-base-chinese/\")\n",
329
+ "\n",
330
+ "# tgt和out列表\n",
331
+ "# tgt_list = ['症状', '器官', '检查']\n",
332
+ "# out_list = ['病状', '身体部位', '诊断','胃']\n",
333
+ "\n",
334
+ "tgt_list = ['乏力感', '厌世', '躯体不适', '社会功能严重受损', '兴趣减退', '言行紊乱', '脑器质性疾病', '情绪低落', '精神障碍', '情绪差伴躯体不适', '焦虑', '自责']\n",
335
+ "out_list = ['认知行为治疗', '与家人交流障碍', '偶有轻生想法', '长期适应性障碍', '利培酮', '心理治疗', '发呆', '独处时感到被支配', '沉迷学佛后出现精神异常', '与家人交流困难', '急性而短暂的精神病性障碍', '兴趣减退', '有被害妄想和攻击行为', '言语紊乱', '自知力可', '攻击行为', '心脏', '肝脏', '反应慢', '持续的情绪低落', '沉迷学佛', '氢溴酸西酞普兰', '无法胜任家务', '情绪低落', '急性起病', '被害妄想', '社会功能受损', '重度抑郁发作']\n",
336
+ "print(len(tgt_list))\n",
337
+ "print(len(out_list))\n",
338
+ "\n",
339
+ "# 确保CUDA可用\n",
340
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
341
+ "\n",
342
+ "# 将模型移动到GPU\n",
343
+ "bert_model.to(device)\n",
344
+ "\n",
345
+ "# 获取词向量\n",
346
+ "def get_word_embedding(word):\n",
347
+ " # 对单词进行编码\n",
348
+ " input_ids = tokenizer.encode(word, add_special_tokens=True, return_tensors='pt').to(device)\n",
349
+ " # 获取词向量\n",
350
+ " with torch.no_grad():\n",
351
+ " output = bert_model(input_ids)\n",
352
+ " # 使用[CLS]标记的向量作为句子向量\n",
353
+ " # 将结果移回CPU,因为cosine_similarity需要numpy数组\n",
354
+ " return output.last_hidden_state[:, 0, :]\n",
355
+ "\n",
356
+ "# 计算相似性矩阵\n",
357
+ "def calculate_similarity_matrix(words_list1, words_list2):\n",
358
+ " if len(words_list1) > 0 and len(words_list2) > 0:\n",
359
+ " embeddings1 = torch.stack([get_word_embedding(word) for word in words_list1])\n",
360
+ " embeddings2 = torch.stack([get_word_embedding(word) for word in words_list2])\n",
361
+ " \n",
362
+ " # 计算余弦相似性矩阵\n",
363
+ " # similarity_matrix = cosine_similarity(embeddings1.reshape(embeddings1.shape[0],-1), embeddings2.reshape(embeddings2.shape[0],-1))\n",
364
+ " similarity_matrix = cosine_similarity(embeddings1.unsqueeze(1), embeddings2.unsqueeze(0), dim=2).cpu().numpy()\n",
365
+ " \n",
366
+ " else:\n",
367
+ " similarity_matrix = np.zeros((2,2))\n",
368
+ " return similarity_matrix\n",
369
+ "\n",
370
+ "\n",
371
+ "time1 = time.time()\n",
372
+ "# 计算tgt和out列表的相似性矩阵\n",
373
+ "for i in range(50):\n",
374
+ " similarity_matrix = calculate_similarity_matrix(tgt_list, out_list)\n",
375
+ "time2 = time.time()\n",
376
+ "time_cost = time2-time1\n",
377
+ "print(time_cost)\n",
378
+ "\n",
379
+ "# 打印相似性矩阵\n",
380
+ "print(similarity_matrix)\n",
381
+ "print(np.max(np.array(similarity_matrix),axis=1).mean())\n"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "code",
386
+ "execution_count": 2,
387
+ "metadata": {},
388
+ "outputs": [
389
+ {
390
+ "name": "stdout",
391
+ "output_type": "stream",
392
+ "text": [
393
+ "100\n"
394
+ ]
395
+ }
396
+ ],
397
+ "source": [
398
+ "import numpy as np\n",
399
+ "ner_result = np.load(r\"G:\\code\\R0\\chinese_medical_ner-main\\ccksyidu4k-ner-roformer\\ccksyidu4k-ner-roformer\\ner_result\\PsychClinical\\1shot\\gpt-3.5-turbo_api\\task3ner_result.npy\",allow_pickle=True)\n",
400
+ "print(len(ner_result))"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "execution_count": 5,
406
+ "metadata": {},
407
+ "outputs": [
408
+ {
409
+ "name": "stdout",
410
+ "output_type": "stream",
411
+ "text": [
412
+ "-------------------------------------------------- task5 --------------------------------------------------\n",
413
+ "gpt-3.5-turbo\n",
414
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/gpt-3.5-turbo_api/task5/ner_result.npy not exist!!!\n",
415
+ "gpt-4o-mini\n",
416
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/gpt-4o-mini_api/task5/ner_result.npy not exist!!!\n",
417
+ "gpt-4\n",
418
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/gpt-4_api/task5/ner_result.npy not exist!!!\n",
419
+ "gemini-1.5-pro\n",
420
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/gemini-1.5-pro_api/task5/ner_result.npy not exist!!!\n",
421
+ "glm4\n",
422
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/glm4_api/task5/ner_result.npy not exist!!!\n",
423
+ "hunyuan-lite\n",
424
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/hunyuan-lite_api/task5/ner_result.npy not exist!!!\n",
425
+ "hunyuan-pro\n",
426
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/hunyuan-pro_api/task5/ner_result.npy not exist!!!\n",
427
+ "minimax\n",
428
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/minimax_api/task5/ner_result.npy not exist!!!\n",
429
+ "spark-4ultra\n",
430
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/spark-4ultra_api/task5/ner_result.npy not exist!!!\n",
431
+ "baichuan4\n",
432
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/baichuan4_api/task5/ner_result.npy not exist!!!\n",
433
+ "deepseek\n",
434
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/deepseek_api/task5/ner_result.npy not exist!!!\n",
435
+ "doubao-pro-32k\n",
436
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/doubao-pro-32k_api/task5/ner_result.npy not exist!!!\n",
437
+ "ernie-4-8k\n",
438
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/ernie-4-8k_api/task5/ner_result.npy not exist!!!\n",
439
+ "moonshot-v1-32k\n",
440
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/moonshot-v1-32k_api/task5/ner_result.npy not exist!!!\n",
441
+ "yi-large\n",
442
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/yi-large_api/task5/ner_result.npy not exist!!!\n",
443
+ "qwen-max\n",
444
+ "Error: G:/code/CMB_wuhu/src/ner_result/PsychClinical/0shot/qwen-max_api/task5/ner_result.npy not exist!!!\n"
445
+ ]
446
+ }
447
+ ],
448
+ "source": [
449
+ "### 计算性能指标\n",
450
+ "\n",
451
+ "import os\n",
452
+ "# model_id指定数据整理的格式 psychAiD与ChatGLM3的格式相同\n",
453
+ "\n",
454
+ "\n",
455
+ "nshot = 0\n",
456
+ "# for task in [1,3,5]:\n",
457
+ "for task in [5]:\n",
458
+ "\n",
459
+ " print('-'*50,'task{}'.format(task),'-'*50)\n",
460
+ "\n",
461
+ " for model in ['gpt-3.5-turbo','gpt-4o-mini','gpt-4','gemini-1.5-pro','glm4','hunyuan-lite','hunyuan-pro','minimax','spark-4ultra','baichuan4','deepseek','doubao-pro-32k','ernie-4-8k','moonshot-v1-32k','yi-large','qwen-max']: \n",
462
+ " # for model in ['baichuan4']: \n",
463
+ " \n",
464
+ " print(model)\n",
465
+ " if task == 5:\n",
466
+ " nshot=0\n",
467
+ " # ans_path = 'G:/code/CMB_0726/result-refined/API/{}shot/task{}_{}.json'.format(nshot,task,model)\n",
468
+ " # dir_out = './ner_result/PsychClinical/{}shot/{}_api/task{}'.format(nshot,model,task)\n",
469
+ "\n",
470
+ " ans_path = 'G:/code/CMB_wuhu/result-refined/API/{}shot/task{}_{}.json'.format(nshot,task,model)\n",
471
+ " dir_out = 'G:/code/CMB_wuhu/src/ner_result/PsychClinical/{}shot/{}_api/task{}'.format(nshot,model,task)\n",
472
+ "\n",
473
+ " # ans_path = 'G:/code/CMB_dali/result-refined/API/{}shot/task{}_{}.json'.format(nshot,task,model)\n",
474
+ " # dir_out = 'G:/code/CMB_dali/src/ner_result/PsychClinical/{}shot/{}_api/task{}'.format(nshot,model,task)\n",
475
+ "\n",
476
+ " \n",
477
+ "\n",
478
+ " \n",
479
+ " ner_path = dir_out +'/ner_result.npy'\n",
480
+ "\n",
481
+ " if not os.path.exists(ner_path):\n",
482
+ " print('Error:',ner_path,'not exist!!!')\n",
483
+ " continue\n",
484
+ " ner_result = np.load(ner_path,allow_pickle=True)\n",
485
+ " ner_scores = []\n",
486
+ " for tgt,out in ner_result:\n",
487
+ " similarity_matrix = calculate_similarity_matrix(tgt, out)\n",
488
+ "\n",
489
+ " # 打印相似性矩阵\n",
490
+ " # print(similarity_matrix)\n",
491
+ " ner_score = np.max(np.array(similarity_matrix),axis=1).mean()\n",
492
+ " ner_scores.append(ner_score)\n",
493
+ " ner_scores_mean = np.mean(ner_scores)\n",
494
+ " ner_scores_std = np.std(ner_scores)\n",
495
+ " print('ner score:{}±{}'.format(ner_scores_mean,ner_scores_std))\n",
496
+ " import json\n",
497
+ "\n",
498
+ " # Load the uploaded JSON file\n",
499
+ " file_path = dir_out +'/metrics.json'\n",
500
+ "\n",
501
+ " # Read the content of the file\n",
502
+ " with open(file_path, 'r') as file:\n",
503
+ " metrics = json.load(file)\n",
504
+ "\n",
505
+ " # Display the content of the JSON file to understand its structure\n",
506
+ " metrics['NER-score'] = {'avg':float(ner_scores_mean),'std':float(ner_scores_std)}\n",
507
+ " with open(file_path, 'w') as file:\n",
508
+ " file.write(json.dumps(metrics))\n",
509
+ " "
510
+ ]
511
+ }
512
+ ],
513
+ "metadata": {
514
+ "kernelspec": {
515
+ "display_name": "py310",
516
+ "language": "python",
517
+ "name": "python3"
518
+ },
519
+ "language_info": {
520
+ "codemirror_mode": {
521
+ "name": "ipython",
522
+ "version": 3
523
+ },
524
+ "file_extension": ".py",
525
+ "mimetype": "text/x-python",
526
+ "name": "python",
527
+ "nbconvert_exporter": "python",
528
+ "pygments_lexer": "ipython3",
529
+ "version": "3.10.13"
530
+ }
531
+ },
532
+ "nbformat": 4,
533
+ "nbformat_minor": 2
534
+ }
chinese_medical_ner/ccksyidu4k-ner-roformer/config.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
# Training hyper-parameters for the NER model.
maxlen = 300                  # maximum input sequence length (tokens)
epochs = 999                  # upper bound on epochs; training presumably stops early — TODO confirm callback
batch_size = 16
bert_layers = 12              # number of transformer layers taken from the pretrained encoder
crf_lr_multiplier = 1000      # enlarge the CRF layer's learning rate when necessary
model_type = 'roformer_v2'
dropout_rate = 0.1
max_lr = 1e-5                 # peak learning rate
lstm_hidden_units = 128
chinese_medical_ner/ccksyidu4k-ner-roformer/cudnn-7.6.5-cuda10.0_0.conda ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:291587fe1bbbff0dc3154f3f5cf9e011b8264d124dedad5f257efa39726a4557
3
+ size 172137578
chinese_medical_ner/ccksyidu4k-ner-roformer/data/chip.train ADDED
The diff for this file is too large to render. See raw diff
 
chinese_medical_ner/ccksyidu4k-ner-roformer/data/chip.validate ADDED
The diff for this file is too large to render. See raw diff
 
chinese_medical_ner/ccksyidu4k-ner-roformer/data/yidu.test ADDED
The diff for this file is too large to render. See raw diff
 
chinese_medical_ner/ccksyidu4k-ner-roformer/data/yidu.train ADDED
The diff for this file is too large to render. See raw diff
 
chinese_medical_ner/ccksyidu4k-ner-roformer/data/yidu.validate ADDED
The diff for this file is too large to render. See raw diff
 
chinese_medical_ner/ccksyidu4k-ner-roformer/evaluate.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! -*- coding: utf-8 -*-
2
+ import os
3
+
4
+ # bert tiny
5
+ import pickle
6
+
7
+ import pandas as pd
8
+ from matplotlib import pyplot as plt
9
+
10
+ from model import BERT
11
+ from path import BASE_CONFIG_NAME, BASE_CKPT_NAME, BASE_MODEL_DIR, train_file_path, test_file_path, val_file_path, \
12
+ weights_path, label_dict_path, categories_f1_path
13
+ from preprocess import load_data, NamedEntityRecognizer
14
+ from plot import f1_plot
15
+
16
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
17
+
18
+ from utils.backend import keras, K
19
+ from utils.tokenizers import Tokenizer
20
+ from tqdm import tqdm
21
+
22
# save_file_path = "./weights/yidu_bert_tiny_lstm_crf.h5"

# BERT configuration: pretrained config / checkpoint / vocabulary paths,
# resolved from the project's `path` module.
config_path = BASE_CONFIG_NAME
checkpoint_path = BASE_CKPT_NAME
dict_path = '{}/vocab.txt'.format(BASE_MODEL_DIR)
+
29
+
30
def get_score(data, NER, tqdm_verbose = False):
    """Compute micro-averaged entity-level F1, precision and recall.

    Args:
        data: iterable of samples; each sample is a sequence whose first
            element is the raw text and whose remaining elements are gold
            entities, each convertible to a tuple (e.g. ``(start, end, label)``).
        NER: recognizer exposing ``recognize(text)`` returning predicted
            entity tuples.
        tqdm_verbose: when True, wrap the iteration in a tqdm progress bar.

    Returns:
        Tuple ``(f1, precision, recall)``. The 1e-10 epsilons guard
        against division by zero when either side is empty.
    """
    tp, n_pred, n_gold = 1e-10, 1e-10, 1e-10
    iterator = data
    if tqdm_verbose:
        # The original duplicated the entire loop body in each branch and
        # reset the progress-bar description on every iteration; configure
        # the bar once and share a single loop instead.
        iterator = tqdm(data, ncols = 100)
        iterator.set_description("Evaluating General F1")
    for d in iterator:
        predicted = set(NER.recognize(d[0]))
        gold = set(tuple(e) for e in d[1:])
        tp += len(predicted & gold)
        n_pred += len(predicted)
        n_gold += len(gold)
    f1, precision, recall = 2 * tp / (n_pred + n_gold), tp / n_pred, tp / n_gold
    return f1, precision, recall
53
+
54
+
55
def get_catetories_score(data, NER, categories, tqdm_verbose = False):
    """Compute per-category TP / precision / recall / F1 statistics.

    (Name keeps the historical ``catetories`` typo for caller compatibility.)

    Args:
        data: iterable of samples; first element is the text, the rest are
            gold entity tuples of the form ``(start, end, label)``.
        NER: recognizer exposing ``recognize(text)``.
        categories: iterable of category labels to report on.
        tqdm_verbose: when True, show a tqdm progress bar.

    Returns:
        dict mapping each category to a dict with integer ``TP``,
        ``TP+FP``, ``TP+FN`` counts and rounded ``precision`` /
        ``recall`` / ``f1`` values.
    """
    stats = {c: {'TP': 1e-10, 'TP+FP': 1e-10, 'TP+FN': 1e-10} for c in categories}
    # Bug fix: the original only looped over `data` inside the
    # `tqdm_verbose` branch, so with tqdm_verbose=False no sample was ever
    # scored and every metric stayed at epsilon.
    iterator = data
    if tqdm_verbose:
        iterator = tqdm(data, ncols = 100)
        iterator.set_description("Evaluating F1 of each Categories")
    for d in iterator:
        # Recognize once per sample: the original re-ran recognition (and
        # rebuilt the gold set) inside the per-category loop, multiplying
        # inference cost by len(categories) for identical results.
        predicted = set(NER.recognize(d[0]))
        gold = set(tuple(e) for e in d[1:])
        for c in categories:
            pred_c = {e for e in predicted if e[2] == c}
            gold_c = {e for e in gold if e[2] == c}
            stats[c]["TP"] += len(pred_c & gold_c)
            stats[c]["TP+FP"] += len(pred_c)
            stats[c]["TP+FN"] += len(gold_c)
    for c in stats:
        s = stats[c]
        s["precision"] = round(s["TP"] / s["TP+FP"], 4)
        s["recall"] = round(s["TP"] / s["TP+FN"], 4)
        s["f1"] = round(2 * s["TP"] / (s["TP+FP"] + s["TP+FN"]), 4)
        s["TP"] = int(s["TP"])
        s["TP+FP"] = int(s["TP+FP"])
        s["TP+FN"] = int(s["TP+FN"])
    return stats
90
+
91
+
92
def evaluate(title, data, CRF, NER):
    """Run the overall evaluation and print the aggregate scores.

    Copies the learned CRF transition matrix into the recognizer before
    scoring, then returns ``(f1, precision, recall)``.
    """
    # Sync the CRF layer's learned transition matrix into the recognizer.
    NER.trans = K.eval(CRF.trans)
    f1, precision, recall = get_score(data, NER, tqdm_verbose = True)
    print(title + ': f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall))
    return f1, precision, recall
98
+
99
+
100
def evaluate_categories(title, data, categories, CRF, NER):
    """Evaluate per-category scores and print them as a DataFrame.

    Syncs the CRF transition matrix into the recognizer, computes
    per-category statistics, and returns the resulting DataFrame
    (categories as rows; TP / TP+FP / TP+FN / precision / recall / f1
    as columns).
    """
    trans = K.eval(CRF.trans)
    NER.trans = trans
    result = get_catetories_score(data, NER, categories, tqdm_verbose = True)
    # for i in result:
    #     print(i, result[i])
    df = pd.DataFrame(result)
    df = df.T
    df[["TP", "TP+FP", "TP+FN"]] = df[["TP", "TP+FP", "TP+FN"]].astype(int)
    # Use fully-qualified option keys: the bare 'max_colwidth' relied on
    # pandas prefix matching, which is deprecated and rejected by newer
    # pandas versions.
    # Show full cell contents (default truncates at 50 chars).
    pd.set_option('display.max_colwidth', 200)
    # Show all columns and all rows when printing.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    print(df)
    return df
117
+
118
+
119
def evaluate_one(save_file_path, dataset_path, csv_path = categories_f1_path, evaluate_categories_f1 = False):
    """Load a trained checkpoint and evaluate it on one dataset.

    Args:
        save_file_path: path to the saved model weights (.h5).
        dataset_path: path to the labelled evaluation dataset.
        csv_path: where to write the per-category report (only used when
            evaluate_categories_f1 is True).
        evaluate_categories_f1: also compute and save per-category scores.

    Returns:
        (f1, precision, recall) of the overall evaluation.
    """
    # Load the pickled label set produced during training.
    with open(label_dict_path, 'rb') as f:
        categories = set(pickle.load(f))

    bert = BERT(config_path,
                checkpoint_path,
                categories,
                summary = False)
    model = bert.get_model()

    # Labelled evaluation data.
    test_data = load_data(dataset_path, categories)
    categories = list(sorted(categories))

    # Build the tokenizer from the pretrained vocabulary.
    tokenizer = Tokenizer(dict_path, do_lower_case = True)

    model.load_weights(save_file_path)
    CRF = bert.get_CRF()
    NER = NamedEntityRecognizer(tokenizer, model, categories, trans = K.eval(CRF.trans), starts = [0], ends = [0])

    print("\nweight path:" + save_file_path)
    print("evaluate dataset path:" + dataset_path)
    f1, precision, recall = evaluate("General", test_data, CRF, NER)
    if evaluate_categories_f1:
        df = evaluate_categories("Each Categories:", test_data, categories, CRF, NER)
        df.to_csv(csv_path, encoding = 'utf-8-sig')
    return f1, precision, recall
147
+
148
+
149
if __name__ == '__main__':

    # Evaluate the roformer_v2 checkpoint on the CHIP validation split and
    # write the per-category F1 report to CSV.
    evaluate_one(save_file_path = weights_path + '/chip_roformer_v2_base.h5',
                 dataset_path = "./data/chip.validate",
                 csv_path = './report/chip_bert_base.csv',
                 evaluate_categories_f1 = True)
chinese_medical_ner/ccksyidu4k-ner-roformer/evaluate_ner.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import evaluate
2
+ import evaluate
3
+ import io
4
+ import json
5
+ import numpy as np
6
+ import os
7
+ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
8
+ import sys
9
+ import tqdm
10
+ import csv
11
+ # from predict import predict
12
+
13
+
14
+ ## chip ner model
15
+ import pickle
16
+ from model import BERT
17
+ from path import BASE_CONFIG_NAME, BASE_CKPT_NAME, BASE_MODEL_DIR, label_dict_path, weights_path,proj_path
18
+ from preprocess import NamedEntityRecognizer
19
+ from utils.tokenizers import Tokenizer
20
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
21
+ # bert配置
22
+ config_path = BASE_CONFIG_NAME
23
+ checkpoint_path = BASE_CKPT_NAME
24
+ dict_path = '{}/vocab.txt'.format(BASE_MODEL_DIR)
25
+
26
+
27
+
28
+ import argparse
29
+
30
+ CALC_REDUNDANT = False # re-calculate, even if scores already exist
31
+
32
def main(label_dict_path, weights_path):
    """Score model-generated answers against reference answers.

    Free-text answers are compared via a CHIP NER model (set-based entity
    precision / recall / F1 per sample, averaged over the dataset and written
    by ``write_all_scores``); multiple-choice answers are scored by accuracy.
    Raw per-sample entity lists are saved as ``.npy`` files in ``args.dir_out``.

    Parameters
    ----------
    label_dict_path : str
        Path to the pickled category set for the NER model.
    weights_path : str
        Directory containing the NER weight file.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): the default used to be the int 5 while type=str; argparse
    # does not run `type` on defaults, so make the default an explicit string.
    parser.add_argument("--dataset", type=str, help="psych exam", default="5")
    parser.add_argument(
        "--ans_path",
        type=str,
        help="path to the model generated ans file",
        default='G:/code/CMB_wuhu/result-refined/API/1shot/task1_glm4.json'
    )
    parser.add_argument(
        "--dir_out",
        type=str,
        help="path to the eval matrics",
        default='G:/code/CMB_wuhu/src/ner_result/PsychClinical/1shot/glm4_api/task1'
    )
    args = parser.parse_args()

    os.makedirs(args.dir_out, exist_ok=True)

    # Load data: aligned (target, output, index) triples for free-text
    # questions, plus (gold, predicted) answer lists for multiple-choice.
    lst_tgt = []
    lst_out = []
    lst_idx = []
    option_qa = [[], []]
    with open(args.ans_path, "r", encoding="utf-8") as f:
        answers = json.load(f)
    idx = 0
    for ans in answers:
        if 'question_type' not in ans.keys():
            ans['question_type'] = 'clinical'
        if '选择题' in ans['question_type']:
            option_qa[0].append(ans['answer'])
            option_qa[1].append(ans['model_answer'])
            idx += 1
        else:
            # BUGFIX: skip failed API calls *before* appending the target.
            # Previously lst_tgt was appended first and then `continue` ran,
            # so every failure shifted the (tgt, out) pairing of all later
            # samples in the zip() loop below.
            if ans['answer_0'] == 'API调用失败':
                continue
            if ans['question_type'] == 'clinical':
                if ans['conversations'][1]['from'] == 'gpt':
                    lst_tgt.append(ans['conversations'][1]['value'])
                else:
                    lst_tgt.append(ans['conversations'][2]['value'])
            else:
                lst_tgt.append(ans['answer'])
            lst_out.append(ans['answer_0'])
            lst_idx.append(idx)
            idx += 1

    print('data num:', idx)

    # Per-sample score dicts, keyed by sample index.
    scores_all = {}

    # Build tokenizer and NER model.
    weight_file = weights_path + '/chip_roformer_v2_base.h5'
    trans_path = proj_path + "/weights/chip_roformer_v2_crf_trans.pkl"
    print(label_dict_path)
    with open(label_dict_path, 'rb') as f:
        categories = pickle.load(f)
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

    bert = BERT(config_path,
                checkpoint_path,
                categories,
                summary=False)
    model = bert.get_model()
    print('loading model weights from ', weight_file)
    model.load_weights(weight_file)
    # Load the CRF transition matrix once; the original opened the pickle
    # twice and leaked both file handles.
    with open(trans_path, 'rb') as f:
        crf_trans = pickle.load(f)
    NER = NamedEntityRecognizer(tokenizer, model, categories,
                                trans=crf_trans, starts=[0], ends=[0])
    NER.trans = crf_trans

    NER_results = []
    NER_cls_results = []

    for tgt, out, sample_idx in tqdm.tqdm(zip(lst_tgt, lst_out, lst_idx)):
        tgt_chip_ner_list = _recognize_sorted(NER, tgt)
        out_chip_ner_list = _recognize_sorted(NER, out)

        tgt_list = [element[0] for element in tgt_chip_ner_list]
        out_list = [element[0] for element in out_chip_ner_list]
        tgt_cls_list = [element[1] for element in tgt_chip_ner_list]
        out_cls_list = [element[1] for element in out_chip_ner_list]

        NER_results.append([tgt_list, out_list])
        NER_cls_results.append([tgt_cls_list, out_cls_list])

        print('-' * 50)
        print('tgt:', tgt_list)
        print('out:', out_list)

        scores_all[sample_idx] = _ner_scores(tgt_list, out_list)

    # Save averaged scores across the entire dataset.
    write_all_scores(args, scores_all)
    print(scores_all)

    # Accuracy over the multiple-choice subset.  A list-valued gold answer is
    # scored as the fraction of predictions that hit a gold option.
    score_list = []
    for gt, pred in zip(option_qa[0], option_qa[1]):
        if isinstance(gt, list):
            hit = sum(1 for apred in pred if apred in gt)
            score = hit / len(gt)
        else:
            score = 1 if gt == pred else 0
        score_list.append(score)
    acc = np.mean(score_list)
    print('accuracy:', acc)
    np.save(args.dir_out + '/ner_result.npy', NER_results)
    np.save(args.dir_out + '/ner_cls_result.npy', NER_cls_results)


def _recognize_sorted(NER, text):
    """Run NER on *text*; return unique (entity, tag, start, end) tuples sorted by start."""
    entities = [(text[start:end + 1], tag, start, end)
                for start, end, tag in set(NER.recognize(text))]
    return sorted(entities, key=lambda d: d[2])


def _ner_scores(tgt_list, out_list):
    """Set-based precision / recall / F1 between entity lists, scaled to [0, 100] and rounded."""
    intersection = set(tgt_list).intersection(set(out_list))
    precision = len(intersection) / len(out_list) if out_list else 0
    recall = len(intersection) / len(tgt_list) if tgt_list else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return {
        'NER-Precision': round(precision * 100., 2),
        'NER-Recall': round(recall * 100., 2),
        'NER-F1': round(f1_score * 100., 2),
    }
219
+
220
+
221
+
222
def compute_scores(tgt, out, metrics='NER', is_cxr=False):
    '''Run the CHIP NER predictor over a target string and a model output
    string, then return a dict of set-based NER precision / recall / F1
    scaled to [0, 100] and rounded to two decimals.

    NOTE(review): `predict` is not imported at module level (the import is
    commented out at the top of the file) — confirm it is available before
    calling this function.
    '''
    from path import BASE_CONFIG_NAME, BASE_CKPT_NAME, BASE_MODEL_DIR, label_dict_path, weights_path

    def _ner(txt):
        # One shared predictor configuration for both texts.
        return predict(txt = txt,
                       weights_path = weights_path + '/chip_roformer_v2_base.h5',
                       label_dict_path = label_dict_path,
                       trans_path = "./weights/chip_roformer_v2_crf_trans.pkl")

    tgt_list = _ner(tgt)
    # BUGFIX: the output entities were previously predicted from `tgt` as
    # well (copy-paste), which made precision/recall/F1 trivially perfect.
    out_list = _ner(out)
    print('-' * 50)
    print('tgt:', tgt_list)
    print('out:', out_list)

    # Intersection = correctly predicted entities.
    intersection = set(tgt_list).intersection(set(out_list))

    precision = len(intersection) / len(out_list) if out_list else 0
    recall = len(intersection) / len(tgt_list) if tgt_list else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    scores = {
        'NER-Precision': precision,
        'NER-Recall': recall,
        'NER-F1': f1_score,
    }

    # scale scores to be on [0,100] instead of [0,1]
    for key in scores:
        scores[key] = round(scores[key] * 100., 2)

    return scores
268
+
269
+
270
def write_all_scores(args, scores_all):
    '''Persist dataset-level scores.

    Writes the per-metric avg/std summary to ``metrics.json`` in
    ``args.dir_out`` and an overleaf-friendly "&"-separated line (metric
    averages rounded to one decimal) to ``metrics.txt``.
    '''
    validate_keys(scores_all)  # sanity check: every sample has the same metric keys

    stats = avg_across_samples(scores_all)
    with open(os.path.join(args.dir_out, 'metrics.json'), 'w') as fh:
        fh.write(json.dumps(stats))

    # Round the averages in place for the copy-paste text output.
    averages = extract_avg_only(stats)
    for metric in averages:
        averages[metric] = round(averages[metric], 1)

    lines = [
        'NER-Precision & NER-Recall & NER-F1',
        f'{averages["NER-Precision"]} & {averages["NER-Recall"]} & {averages["NER-F1"]}',
    ]
    write_list_to_csv(os.path.join(args.dir_out, 'metrics.txt'), lines)
296
+
297
+
298
def avg_across_samples(scores_all):
    '''Average each NER metric across all per-sample score dicts.

    Returns ``{metric: {'avg': mean, 'std': stddev}}`` with both values
    rounded to two decimals.
    '''
    metric_names = ("NER-Precision", "NER-Recall", "NER-F1")
    summary = {}
    for metric in metric_names:
        samples = [scores[metric] for scores in scores_all.values()]
        summary[metric] = {'avg': round(np.mean(samples), 2),
                           'std': round(np.std(samples), 2)}
    return summary
311
+
312
+
313
def extract_avg_only(scores_avg_std):
    '''Collapse ``{metric: {'avg': a, 'std': s}}`` down to ``{metric: a}``.'''
    return {metric: stats['avg'] for metric, stats in scores_avg_std.items()}
319
+
320
+
321
def validate_keys(my_dict):
    '''Validate that every sub-dict of *my_dict* has the same key set.

    Raises
    ------
    ValueError
        If any two sub-dicts differ in their keys.  (The previous version
        used ``assert``, which is silently stripped when Python runs with
        ``-O``, so the validation could never fire in optimized mode.)
    '''
    sub_dict_keys = None
    for sub_dict in my_dict.values():
        if sub_dict_keys is None:
            sub_dict_keys = set(sub_dict.keys())
        elif set(sub_dict.keys()) != sub_dict_keys:
            raise ValueError('sub-dicts do not contain same keys')
333
+
334
+
335
+ # def wrap_str_in_lst(var):
336
+ # if isinstance(var, str):
337
+ # return [var]
338
+ # return var
339
+
340
def wrap_str_in_lst(text):
    """Segment *text* with jieba (accurate mode) and return the token list.

    NOTE(review): `jieba` is never imported in this module, so calling this
    function raises NameError as written — confirm the intended import.
    The name is historical: the commented-out predecessor above merely
    wrapped a bare str in a list without segmenting it.
    """
    # Precise (non-full) jieba word segmentation
    segmented_text = jieba.cut(text, cut_all=False)
    # Materialize the generator's tokens as a list
    return list(segmented_text)
345
+
346
+
347
def write_list_to_csv(fn_csv, list_, csv_action='w'):
    '''Write each element of a 1-D list to *fn_csv*, one element per line.

    Pass ``csv_action="a"`` to append to an existing file instead of
    overwriting it.
    '''
    # newline='' is required when handing a file to the csv module; without
    # it Windows translates the writer's \r\n terminator into \r\r\n.
    with open(fn_csv, csv_action, newline='') as f:
        writer = csv.writer(f, delimiter='\n')
        writer.writerow(list_)

    return
356
+
357
+
358
if __name__ == '__main__':
    # Run the full evaluation with the module-level label-dict and weights
    # paths imported from `path`.
    main(label_dict_path, weights_path)
chinese_medical_ner/ccksyidu4k-ner-roformer/images/chip_train_acc.png ADDED
chinese_medical_ner/ccksyidu4k-ner-roformer/images/chip_train_loss.png ADDED
chinese_medical_ner/ccksyidu4k-ner-roformer/images/chip_val_f1.png ADDED
chinese_medical_ner/ccksyidu4k-ner-roformer/images/downstream.png ADDED

Git LFS Details

  • SHA256: 406ba53ab27936fe57edee3399801ea27fb58d8d8f35f72c88e6a873414670db
  • Pointer size: 131 Bytes
  • Size of remote file: 540 kB
chinese_medical_ner/ccksyidu4k-ner-roformer/images/model.jpg ADDED

Git LFS Details

  • SHA256: c0e0cbea60550f0f2e9b946f7d2ea712f07b4a62e5724b3e1d51fdd11077b7cd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.77 MB
chinese_medical_ner/ccksyidu4k-ner-roformer/images/yidu_train_acc.png ADDED